Systemd/src/core/execute.c

/* SPDX-License-Identifier: LGPL-2.1+ */

#include <errno.h>
#include <fcntl.h>
#include <glob.h>
#include <grp.h>
#include <poll.h>
#include <signal.h>
#include <string.h>
#include <sys/capability.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <sys/personality.h>
#include <sys/prctl.h>
#include <sys/shm.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>
#include <utmpx.h>

#if HAVE_PAM
#include <security/pam_appl.h>
#endif

#if HAVE_SELINUX
#include <selinux/selinux.h>
#endif

#if HAVE_SECCOMP
#include <seccomp.h>
#endif

#if HAVE_APPARMOR
#include <sys/apparmor.h>
#endif

#include "sd-messages.h"

#include "af-list.h"
#include "alloc-util.h"
#if HAVE_APPARMOR
#include "apparmor-util.h"
#endif
#include "async.h"
#include "barrier.h"
#include "cap-list.h"
#include "capability-util.h"
#include "chown-recursive.h"
#include "cpu-set-util.h"
#include "def.h"
#include "env-util.h"
#include "errno-list.h"
#include "execute.h"
#include "exit-status.h"
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
#include "fs-util.h"
#include "glob-util.h"
#include "io-util.h"
#include "ioprio.h"
#include "label.h"
#include "log.h"
#include "macro.h"
#include "manager.h"
#include "missing.h"
#include "mkdir.h"
#include "namespace.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "rlimit-util.h"
#include "rm-rf.h"
#if HAVE_SECCOMP
#include "seccomp-util.h"
#endif
#include "securebits.h"
#include "securebits-util.h"
#include "selinux-util.h"
#include "signal-util.h"
#include "smack-util.h"
#include "socket-util.h"
#include "special.h"
#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
#include "syslog-util.h"
#include "terminal-util.h"
#include "umask-util.h"
#include "unit.h"
#include "user-util.h"
#include "util.h"
#include "utmp-wtmp.h"

/* Two timeouts, 5x and 1x USEC_PER_SEC. NOTE(review): their users are not visible in this part of the file;
 * presumably they bound how long a child waits on the idle pipe — confirm against the callers. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Access mode chown_terminal() applies to (and verifies on) a TTY. This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Send buffer size (8 MiB) that connect_logger_as() requests for its stream socket via fd_inc_sndbuf() */
#define SNDBUF_SIZE (8*1024*1024)

/* Renumbers the passed file descriptors so that fds[i] ends up as fd number i+3. Every entry that is not yet
 * in place is duplicated to the lowest free number >= i+3 and the old descriptor is closed. Note that the
 * array is modified in place. Returns 0 on success, a negative errno-style error if fcntl() fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        assert(fds);

        /* Keep sweeping over the array until a full pass placed every fd at its final slot. */
        while (first >= 0) {
                int retry_at = -1, idx;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dup_fd;

                        /* Nothing to do if the fd already sits where it belongs */
                        if (fds[idx] == target)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, target);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we wanted was still occupied, so we got a higher number. Remember the
                         * first such index, the next pass starts there. */
                        if (dup_fd != target && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        }

        return 0;
}

/* Prepares the fds we are going to pass to the child: FD_CLOEXEC is dropped from all of them, and O_NONBLOCK
 * is set or cleared (according to 'nonblock') on the first n_socket_fds entries only, i.e. the socket
 * activation fds. Returns 0 on success, or the first negative error reported by a helper. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;
        size_t idx = 0;

        if (total <= 0)
                return 0;

        assert(fds);

        while (idx < total) {
                int r;

                /* O_NONBLOCK is only adjusted for the socket activation fds, which come first */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC always goes away, after all the fds shall survive into our children */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;

                idx++;
        }

        return 0;
}

/* Returns the path of the TTY this context refers to: NULL if stdio is passed in as fds, otherwise the
 * configured tty_path, falling back to "/dev/console" when none is set. */
static const char *exec_context_tty_path(const ExecContext *context) {
        assert(context);

        if (context->stdio_as_fds)
                return NULL;

        return context->tty_path ? context->tty_path : "/dev/console";
}

/* Applies the TTY clean-up operations enabled in the context (vhangup, reset, VT disallocation), in that
 * order. For vhangup and reset the stdin fd from 'p' is preferred if one was passed, otherwise the TTY path
 * of the context is used. All operations are best-effort, failures are ignored. 'p' may be NULL. */
static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
        const char *tty;
        bool use_stdin_fd;

        assert(context);

        tty = exec_context_tty_path(context);
        use_stdin_fd = p && p->stdin_fd >= 0;

        if (context->tty_vhangup) {
                if (use_stdin_fd)
                        (void) terminal_vhangup_fd(p->stdin_fd);
                else if (tty)
                        (void) terminal_vhangup(tty);
        }

        if (context->tty_reset) {
                if (use_stdin_fd)
                        (void) reset_terminal_fd(p->stdin_fd, true);
                else if (tty)
                        (void) reset_terminal(tty);
        }

        /* Disallocation always works on the path, there is no fd-based variant used here */
        if (tty && context->tty_vt_disallocate)
                (void) vt_disallocate(tty);
}

/* Returns true for the stdin settings that are backed by a TTY. */
static bool is_terminal_input(ExecInput i) {
        switch (i) {

        case EXEC_INPUT_TTY:
        case EXEC_INPUT_TTY_FORCE:
        case EXEC_INPUT_TTY_FAIL:
                return true;

        default:
                return false;
        }
}

/* Returns true for the output settings that write to a TTY, either exclusively or in addition to a log
 * target (the "+console" variants). */
static bool is_terminal_output(ExecOutput o) {
        switch (o) {

        case EXEC_OUTPUT_TTY:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                return true;

        default:
                return false;
        }
}

/* Returns true for the output settings that include syslog as a target. */
static bool is_syslog_output(ExecOutput o) {
        return o == EXEC_OUTPUT_SYSLOG ||
                o == EXEC_OUTPUT_SYSLOG_AND_CONSOLE;
}

/* Returns true for the output settings that include kmsg as a target. */
static bool is_kmsg_output(ExecOutput o) {
        return o == EXEC_OUTPUT_KMSG ||
                o == EXEC_OUTPUT_KMSG_AND_CONSOLE;
}

/* Tells whether the execution context suggests that $TERM should be set to something useful: that is the
 * case if stdin, stdout or stderr is connected to a terminal, or if a TTY path is configured. */
static bool exec_context_needs_term(const ExecContext *c) {
        assert(c);

        return is_terminal_input(c->std_input) ||
                is_terminal_output(c->std_output) ||
                is_terminal_output(c->std_error) ||
                c->tty_path != NULL;
}

/* Opens /dev/null with the given access flags (O_NOCTTY is always added) and moves the result to fd number
 * 'nfd'. Returns what move_fd() returns on success, a negative errno-style error if open() fails. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}

/* Connects 'fd' to the journal's stdout stream socket at /run/systemd/journal/stdout. If 'uid' and/or 'gid'
 * are valid the effective IDs are switched to them for the duration of the connect() call and switched back
 * afterwards. Returns 0 on success, a negative errno-style error otherwise. */
static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
        static const union sockaddr_union sa = {
                .un.sun_family = AF_UNIX,
                .un.sun_path = "/run/systemd/journal/stdout",
        };
        bool switch_uid = uid_is_valid(uid), switch_gid = gid_is_valid(gid);
        uid_t saved_uid = UID_INVALID;
        gid_t saved_gid = GID_INVALID;
        int r;

        /* The group goes first, the user second; on the way back it's the other way round */
        if (switch_gid) {
                saved_gid = getgid();

                if (setegid(gid) < 0)
                        return -errno;
        }

        if (switch_uid) {
                saved_uid = getuid();

                if (seteuid(uid) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0)
                r = -errno;
        else
                r = 0;

        /* Failing to restore the uid or gid will likely make things fail later on. That should only
         * happen if an LSM interferes, hence the results are not checked. */

        if (switch_uid)
                (void) seteuid(saved_uid);

 finish:
        if (switch_gid)
                (void) setegid(saved_gid);

        return r;
}

/* Creates a stream connection to the journal and installs it as fd number 'nfd'.
 *
 * A new AF_UNIX stream socket is connected via connect_journal_socket() (using 'uid'/'gid' for the
 * connect), its read side is shut down, its send buffer is enlarged (best-effort) and the stream header is
 * written: identifier (the context's syslog identifier if set, 'ident' otherwise), unit id (only if
 * EXEC_PASS_LOG_UNIT is set, otherwise empty), syslog priority, level-prefix flag, and three flags telling
 * whether 'output' forwards to syslog, kmsg and the console.
 *
 * Returns the result of move_fd() on success, a negative errno-style error otherwise. The socket is closed
 * on every failure path. */
static int connect_logger_as(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                ExecOutput output,
                const char *ident,
                int nfd,
                uid_t uid,
                gid_t gid) {

        int fd, r;

        assert(context);
        assert(params);
        assert(output < _EXEC_OUTPUT_MAX);
        assert(ident);
        assert(nfd >= 0);

        fd = socket(AF_UNIX, SOCK_STREAM, 0);
        if (fd < 0)
                return -errno;

        r = connect_journal_socket(fd, uid, gid);
        if (r < 0) {
                /* Don't leak the socket if the journal isn't reachable */
                safe_close(fd);
                return r;
        }

        if (shutdown(fd, SHUT_RD) < 0) {
                /* Pick up the error before closing, so that we don't depend on close leaving errno alone */
                r = -errno;
                safe_close(fd);
                return r;
        }

        (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);

        dprintf(fd,
                "%s\n"
                "%s\n"
                "%i\n"
                "%i\n"
                "%i\n"
                "%i\n"
                "%i\n",
                context->syslog_identifier ?: ident,
                params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
                context->syslog_priority,
                !!context->syslog_level_prefix,
                is_syslog_output(output),
                is_kmsg_output(output),
                is_terminal_output(output));

        return move_fd(fd, nfd, false);
}
/* Opens the terminal at 'path' with the given flags (O_NOCTTY is always added) and moves it to fd number
 * 'nfd'. Returns the result of move_fd() on success, or the negative error from open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}

/* Opens 'path' for use as stdin/stdout/stderr and returns the new fd, or a negative errno-style error.
 *
 * O_CREAT is added when the access mode includes writing. If open() fails with ENXIO the path might refer to
 * an AF_UNIX file system socket, in which case we connect to it instead and shut down the direction that the
 * requested access mode does not need. */
static int acquire_path(const char *path, int flags, mode_t mode) {
        union sockaddr_union sa = {
                .sa.sa_family = AF_UNIX,
        };
        int access_mode, fd, r;

        assert(path);

        access_mode = flags & O_ACCMODE;
        if (access_mode == O_WRONLY || access_mode == O_RDWR)
                flags |= O_CREAT;

        fd = open(path, flags|O_NOCTTY, mode);
        if (fd >= 0)
                return fd;

        /* ENXIO is what Linux returns when open() is called on an AF_UNIX file system socket. Everything
         * else is a genuine failure. */
        if (errno != ENXIO)
                return -errno;

        /* Paths that don't fit into sun_path can't be sockets */
        if (strlen(path) > sizeof(sa.un.sun_path))
                return -ENXIO;

        /* Looks like this could be an AF_UNIX socket, so let's try to connect to it. */
        fd = socket(AF_UNIX, SOCK_STREAM, 0);
        if (fd < 0)
                return -errno;

        strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
        if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
                /* NOTE(review): errno is read after safe_close(); presumably safe_close() leaves errno
                 * untouched — confirm. EINVAL indicates this wasn't an AF_UNIX socket after all, in which
                 * case the initial error is propagated. */
                safe_close(fd);
                return errno == EINVAL ? -ENXIO : -errno;
        }

        switch (access_mode) {

        case O_RDONLY:
                r = shutdown(fd, SHUT_WR);
                break;

        case O_WRONLY:
                r = shutdown(fd, SHUT_RD);
                break;

        default:
                /* Read-write access: keep both directions open */
                return fd;
        }

        if (r < 0) {
                safe_close(fd);
                return -errno;
        }

        return fd;
}

/* Returns the stdin setting that is effectively used for the context. The configured value is downgraded to
 * EXEC_INPUT_NULL when it cannot be honoured: a TTY input while TTY stdin shall not be applied, a socket
 * input without a socket fd, or data input without any data. */
static int fixup_input(
                const ExecContext *context,
                int socket_fd,
                bool apply_tty_stdin) {

        ExecInput requested;
        bool unusable;

        assert(context);

        requested = context->std_input;

        unusable =
                (is_terminal_input(requested) && !apply_tty_stdin) ||
                (requested == EXEC_INPUT_SOCKET && socket_fd < 0) ||
                (requested == EXEC_INPUT_DATA && context->stdin_data_size == 0);

        return unusable ? EXEC_INPUT_NULL : requested;
}

/* Returns the output setting that is effectively used: socket output without a socket fd is downgraded to
 * EXEC_OUTPUT_INHERIT, everything else is returned as is. */
static int fixup_output(ExecOutput std_output, int socket_fd) {
        bool missing_socket = std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0;

        return missing_socket ? EXEC_OUTPUT_INHERIT : std_output;
}

static int setup_input(
                const ExecContext *context,
                const ExecParameters *params,
                int socket_fd,
                int named_iofds[3]) {

        ExecInput i;

        assert(context);
        assert(params);

        if (params->stdin_fd >= 0) {
                if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
                        return -errno;

                /* Try to make this the controlling tty, if it is a tty, and reset it */
                if (isatty(STDIN_FILENO)) {
                        (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
                        (void) reset_terminal_fd(STDIN_FILENO, true);
                }

                return STDIN_FILENO;
        }

        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);

        switch (i) {

        case EXEC_INPUT_NULL:
                return open_null_as(O_RDONLY, STDIN_FILENO);

        case EXEC_INPUT_TTY:
        case EXEC_INPUT_TTY_FORCE:
        case EXEC_INPUT_TTY_FAIL: {
                int fd;

                fd = acquire_terminal(exec_context_tty_path(context),
                                      i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
                                      i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
                                                                  ACQUIRE_TERMINAL_WAIT,
                                      USEC_INFINITY);
                if (fd < 0)
                        return fd;

                return move_fd(fd, STDIN_FILENO, false);
        }

        case EXEC_INPUT_SOCKET:
                assert(socket_fd >= 0);

                return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;

        case EXEC_INPUT_NAMED_FD:
                assert(named_iofds[STDIN_FILENO] >= 0);

                (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
                return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;

        case EXEC_INPUT_DATA: {
                int fd;

                fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
                if (fd < 0)
                        return fd;

                return move_fd(fd, STDIN_FILENO, false);
        }

        case EXEC_INPUT_FILE: {
                bool rw;
                int fd;

                assert(context->stdio_file[STDIN_FILENO]);

                rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
                        (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));

                fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, STDIN_FILENO, false);
        }

        default:
                assert_not_reached("Unknown input type");
        }
}

/* Sets up stdout or stderr (selected by 'fileno') for the child according to the context and parameters.
 *
 * An explicitly passed params->stdout_fd/params->stderr_fd always wins. Otherwise the effective input and
 * output settings (see fixup_input()/fixup_output()) decide what 'fileno' gets connected to. If a journal
 * stream is connected, its device/inode numbers are stored in *journal_stream_dev/*journal_stream_ino.
 *
 * This relies on stdin (and, when called for stderr, stdout) having been set up already, since both may be
 * duplicated from here.
 *
 * Returns a non-negative value on success, a negative errno-style error on failure. */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                int named_iofds[3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds take precedence over anything configured in the context */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on the stderr setting is what the switch below acts on */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal already, share it rather than opening the TTY a second time */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not being able to log is not fatal: warn and fall back to /dev/null */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND: {
                bool rw;
                int fd, flags;

                assert(context->stdio_file[fileno]);

                /* If stdin is read from the very same file, reuse the stdin fd instead of opening the file
                 * again (setup_input() opens it read-write when stdout/stderr use plain file output) */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                flags = O_WRONLY;
                if (o == EXEC_OUTPUT_FILE_APPEND)
                        flags |= O_APPEND;

                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);

                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, 0);
        }

        default:
                assert_not_reached("Unknown error type");
        }
}

/* Hands the terminal referred to by 'fd' over to 'uid' and sets its access mode to TTY_MODE. Does nothing
 * (and returns 0) if 'fd' is not a tty. Returns 0 if the tty ends up with the desired owner and mode,
 * -EPERM if it doesn't, or a negative errno-style error if fstat() fails. */
static int chown_terminal(int fd, uid_t uid) {
        struct stat info;

        assert(fd >= 0);

        /* Only ever chown/chmod things that really are ttys */
        if (isatty(fd) < 1)
                return 0;

        /* Either call may fail, we only care about the resulting state, which is verified below. */
        (void) fchown(fd, uid, -1);
        (void) fchmod(fd, TTY_MODE);

        if (fstat(fd, &info) < 0)
                return -errno;

        return (info.st_uid == uid && (info.st_mode & 0777) == TTY_MODE) ? 0 : -EPERM;
}

static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
        _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
        int r;

        assert(_saved_stdin);
        assert(_saved_stdout);

        saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
        if (saved_stdin < 0)
                return -errno;

        saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
        if (saved_stdout < 0)
                return -errno;

        fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
        if (fd < 0)
                return fd;

        r = chown_terminal(fd, getuid());
        if (r < 0)
                return r;

        r = reset_terminal_fd(fd, true);
        if (r < 0)
                return r;

        r = rearrange_stdio(fd, fd, STDERR_FILENO);
        fd = -1;
        if (r < 0)
                return r;

        *_saved_stdin = saved_stdin;
        *_saved_stdout = saved_stdout;

        saved_stdin = saved_stdout = -1;

        return 0;
}

/* Writes a message to 'fd' explaining that the confirmation question for unit 'u' could not be asked or
 * timed out, and that a positive response is assumed. 'err' is the negative errno-style error. */
static void write_confirm_error_fd(int err, int fd, const Unit *u) {
        assert(err < 0);

        if (err == -ETIMEDOUT) {
                dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
                return;
        }

        /* %m formats the current errno, hence load the error into it first */
        errno = -err;
        dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
}

static void write_confirm_error(int err, const char *vc, const Unit *u) {
        _cleanup_close_ int fd = -1;

        assert(vc);

        fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
        if (fd < 0)
                return;

        write_confirm_error_fd(err, fd, u);
}

/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout back in place (where
 * valid) and closes the saved fds, resetting *saved_stdin and *saved_stdout to whatever safe_close()
 * returns. Returns 0 on success, otherwise the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}

/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y': execute the command; also the answer assumed on internal errors */
};

/* Asks interactively on the terminal 'vc' whether the command line 'cmdline' of unit 'u' shall be executed.
 *
 * stdin/stdout are temporarily redirected to 'vc' and restored before returning. Besides the actual answers
 * ('y', 'f', 's', 'c') a couple of informational commands ('D', 'h', 'i', 'j') are understood, after which
 * the question is asked again.
 *
 * Returns CONFIRM_EXECUTE, CONFIRM_PRETEND_SUCCESS or CONFIRM_PRETEND_FAILURE. On any internal error
 * (terminal not available, out of memory, failure to read the answer) a positive response is assumed, i.e.
 * CONFIRM_EXECUTE is returned. */
static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
        int saved_stdout = -1, saved_stdin = -1, r;
        _cleanup_free_ char *e = NULL;
        char c;

        /* For any internal errors, assume a positive response. */
        r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
        if (r < 0) {
                write_confirm_error(r, vc, u);
                return CONFIRM_EXECUTE;
        }

        /* confirm_spawn might have been disabled while we were sleeping. */
        if (manager_is_confirm_spawn_disabled(u->manager)) {
                r = CONFIRM_EXECUTE;
                goto restore_stdio;
        }

        /* Shorten overly long command lines for the prompt; the 'i' command shows the full one */
        e = ellipsize(cmdline, 60, 100);
        if (!e) {
                log_oom();
                r = CONFIRM_EXECUTE;
                goto restore_stdio;
        }

        for (;;) {
                r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
                if (r < 0) {
                        write_confirm_error_fd(r, STDOUT_FILENO, u);
                        r = CONFIRM_EXECUTE;
                        goto restore_stdio;
                }

                switch (c) {
                case 'c':
                        printf("Resuming normal execution.\n");
                        manager_disable_confirm_spawn();
                        r = CONFIRM_EXECUTE;
                        break;
                case 'D':
                        unit_dump(u, stdout, "  ");
                        continue; /* ask again */
                case 'f':
                        printf("Failing execution.\n");
                        r = CONFIRM_PRETEND_FAILURE;
                        break;
                case 'h':
                        printf("  c - continue, proceed without asking anymore\n"
                               "  D - dump, show the state of the unit\n"
                               "  f - fail, don't execute the command and pretend it failed\n"
                               "  h - help\n"
                               "  i - info, show a short summary of the unit\n"
                               "  j - jobs, show jobs that are in progress\n"
                               "  s - skip, don't execute the command and pretend it succeeded\n"
                               "  y - yes, execute the command\n");
                        continue; /* ask again */
                case 'i':
                        /* The arguments must follow the order of the labels: description first, then
                         * the unit id. Never hand a NULL string to %s, in case no description is set. */
                        printf("  Description: %s\n"
                               "  Unit:        %s\n"
                               "  Command:     %s\n",
                               u->description ?: "n/a", u->id, cmdline);
                        continue; /* ask again */
                case 'j':
                        manager_dump_jobs(u->manager, stdout, "  ");
                        continue; /* ask again */
                case 'n':
                        /* 'n' was removed in favor of 'f'. */
                        printf("Didn't understand 'n', did you mean 'f'?\n");
                        continue; /* ask again */
                case 's':
                        printf("Skipping execution.\n");
                        r = CONFIRM_PRETEND_SUCCESS;
                        break;
                case 'y':
                        r = CONFIRM_EXECUTE;
                        break;
                default:
                        assert_not_reached("Unhandled choice");
                }
                break;
        }

restore_stdio:
        restore_confirm_stdio(&saved_stdin, &saved_stdout);
        return r;
}

/* Resolves the user configured in the context, if any. On success *user is set to the name handed back by
 * get_user_creds(), and uid/gid/home/shell are filled in by it. If no user is configured nothing is touched
 * and 0 is returned. Returns a negative error if the lookup fails. */
static int get_fixed_user(const ExecContext *c, const char **user,
                          uid_t *uid, gid_t *gid,
                          const char **home, const char **shell) {
        const char *resolved;
        int r;

        assert(c);

        resolved = c->user;
        if (!resolved)
                return 0;

        /* Note that $HOME and $SHELL are not set if they are not particularly enlightening anyway (i.e.
         * are "/" or "/bin/nologin"). */
        r = get_user_creds(&resolved, uid, gid, home, shell, USER_CREDS_CLEAN);
        if (r < 0)
                return r;

        *user = resolved;

        return 0;
}

/* Resolves the group configured in the context, if any. On success *group is set to the name handed back by
 * get_group_creds() and *gid to its GID. If no group is configured nothing is touched and 0 is returned.
 * Returns a negative error if the lookup fails. */
static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
        const char *resolved;
        int r;

        assert(c);

        resolved = c->group;
        if (!resolved)
                return 0;

        r = get_group_creds(&resolved, gid, 0);
        if (r < 0)
                return r;

        *group = resolved;

        return 0;
}

static int get_supplementary_groups(const ExecContext *c, const char *user,
                                    const char *group, gid_t gid,
                                    gid_t **supplementary_gids, int *ngids) {
        char **i;
        int r, k = 0;
        int ngroups_max;
        bool keep_groups = false;
        gid_t *groups = NULL;
        _cleanup_free_ gid_t *l_gids = NULL;

        assert(c);

        /*
         * If user is given, then lookup GID and supplementary groups list.
         * We avoid NSS lookups for gid=0. Also we have to initialize groups
         * here and as early as possible so we keep the list of supplementary
         * groups of the caller.
         */
        if (user && gid_is_valid(gid) && gid != 0) {
                /* First step, initialize groups from /etc/groups */
                if (initgroups(user, gid) < 0)
                        return -errno;

                keep_groups = true;
        }

        if (strv_isempty(c->supplementary_groups))
                return 0;

        /*
         * If SupplementaryGroups= was passed then NGROUPS_MAX has to
         * be positive, otherwise fail.
         */
        errno = 0;
        ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
        if (ngroups_max <= 0) {
                if (errno > 0)
                        return -errno;
                else
                        return -EOPNOTSUPP; /* For all other values */
        }

        l_gids = new(gid_t, ngroups_max);
        if (!l_gids)
                return -ENOMEM;

        if (keep_groups) {
                /*
                 * Lookup the list of groups that the user belongs to, we
                 * avoid NSS lookups here too for gid=0.
                 */
                k = ngroups_max;
                if (getgrouplist(user, gid, l_gids, &k) < 0)
                        return -EINVAL;
        } else
                k = 0;

        STRV_FOREACH(i, c->supplementary_groups) {
                const char *g;

                if (k >= ngroups_max)
                        return -E2BIG;

                g = *i;
                r = get_group_creds(&g, l_gids+k, 0);
                if (r < 0)
                        return r;

                k++;
        }

        /*
         * Sets ngids to zero to drop all supplementary groups, happens
         * when we are under root and SupplementaryGroups= is empty.
         */
        if (k == 0) {
                *ngids = 0;
                return 0;
        }

        /* Otherwise get the final list of supplementary groups */
        groups = memdup(l_gids, sizeof(gid_t) * k);
        if (!groups)
                return -ENOMEM;

        *supplementary_gids = groups;
        *ngids = k;

        groups = NULL;

        return 0;
}

/* Applies the group credentials to the calling process: first the supplementary groups (only if the list is
 * non-empty), then the real, effective and saved GID (only if 'gid' is valid). Returns 0 on success, a
 * negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}

/* Switches real, effective and saved UID to 'uid' without doing any user database lookup. An invalid UID
 * is treated as "nothing to do". Returns 0 on success, negative errno on failure. */
static int enforce_user(const ExecContext *context, uid_t uid) {
        assert(context);

        if (!uid_is_valid(uid))
                return 0;

        /* If ambient capabilities are configured and we are about to become a non-root user, the
         * keep-caps secure bit has to be set first, so that the capabilities survive the UID change
         * below. The prctl() is skipped when the secure bits already have the wanted value. */
        if (context->capability_ambient_set != 0 && uid != 0) {
                int wanted = context->secure_bits | 1<<SECURE_KEEP_CAPS;

                if (prctl(PR_GET_SECUREBITS) != wanted &&
                    prctl(PR_SET_SECUREBITS, wanted) < 0)
                        return -errno;
        }

        /* Now actually change the UIDs */
        if (setresuid(uid, uid, uid) < 0)
                return -errno;

        /* From here on we should hold all necessary capabilities but otherwise be a normal user. The
         * capability sets may have been altered by setresuid(), though; cleaning them up is left to the
         * caller and is not done in this function. */

        return 0;
}

#if HAVE_PAM

/* PAM conversation callback that refuses every conversation request: all arguments are ignored and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(int num_msg, const struct pam_message **msg, struct pam_response **resp, void *appdata_ptr) {
        return PAM_CONV_ERR;
}

#endif

/* Opens a PAM session for service 'name' and user 'user' and forks off a helper process ("(sd-pam)") that
 * closes the session again once the calling process has gone away.
 *
 *   name     PAM service name passed to pam_start()
 *   user     user name passed to pam_start()
 *   uid/gid  credentials the helper process drops to
 *   tty      TTY path to set as PAM_TTY; if NULL, the TTY behind stdin (if any) is used instead
 *   env      environment block; its entries are exported to PAM, and on success it is replaced by the
 *            list returned by pam_getenvlist()
 *   fds      file descriptors (n_fds entries) that are closed in the helper process
 *
 * Returns a negative errno-style value on failure (-EPERM for PAM-level errors), otherwise whatever
 * strv_free_and_replace() evaluates to. When built without PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM modules quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment block to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From now on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure by returning a positive error number; it
                                 * never returns a negative value and does not set errno. Hence check
                                 * the return value itself, so that 'sig' is only looked at when it
                                 * was actually filled in. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* A non-success PAM code takes precedence over whatever is stored in r */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}

/* Renames the calling process to "(<name>)", where <name> is the last path component of 'path', cut down
 * to its final 8 characters if it is longer. If the last component is empty, "(...)" is used. */
static void rename_process_from_path(const char *path) {
        char process_name[11];
        const char *base;
        size_t len;

        /* The result has to fit into 10 characters (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps: two parentheses plus at most 8 characters of the name. */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        len = strlen(base);
        if (len > 8) {
                /* Keep the tail rather than the head: the beginning is often just a common prefix
                 * such as "systemd-", the end is usually more telling. */
                base += len - 8;
                len = 8;
        }

        process_name[0] = '(';
        memcpy(process_name + 1, base, len);
        process_name[len + 1] = ')';
        process_name[len + 2] = 0;

        rename_process(process_name);
}

/* Returns true if the context either has the address family whitelist flag set or carries a non-empty
 * address family set. */
static bool context_has_address_families(const ExecContext *c) {
        assert(c);

        if (c->address_families_whitelist)
                return true;

        return !set_isempty(c->address_families);
}

/* Returns true if the context either has the system call whitelist flag set or carries a non-empty system
 * call filter map. */
static bool context_has_syscall_filters(const ExecContext *c) {
        assert(c);

        if (c->syscall_whitelist)
                return true;

        return !hashmap_isempty(c->syscall_filter);
}

/* Decides whether "no new privileges" applies to this context: always if it was requested explicitly,
 * never (otherwise) if we hold CAP_SYS_ADMIN, and else whenever any of the seccomp-backed settings checked
 * below is in use. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We are unprivileged: any form of seccomp requires NNP. The checks are done one by one, in the
         * same order a single boolean expression would evaluate them. */
        if (context_has_address_families(c))
                return true;
        if (c->memory_deny_write_execute)
                return true;
        if (c->restrict_realtime)
                return true;
        if (exec_context_restrict_namespaces_set(c))
                return true;
        if (c->protect_kernel_tunables)
                return true;
        if (c->protect_kernel_modules)
                return true;
        if (c->private_devices)
                return true;
        if (context_has_syscall_filters(c))
                return true;
        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}

#if HAVE_SECCOMP

/* Returns true (after logging a debug message for unit 'u' that mentions 'msg') if seccomp is not
 * available, so that the caller can skip the corresponding setting. Returns false otherwise. */
static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
        bool unavailable;

        unavailable = !is_seccomp_available();
        if (unavailable)
                log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);

        return unavailable;
}

/* Loads the system call filter configured in the context. Does nothing (and returns 0) if no filter is
 * configured or seccomp is unavailable. If 'needs_ambient_hack' is set, the SYSCALL_FILTER_SET_SETUID set
 * is merged into the context's filter map before it is loaded. Returns a negative value on failure. */
static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
        uint32_t deny_action, fallback_action, rule_action;

        assert(u);
        assert(c);

        if (!context_has_syscall_filters(c) ||
            skip_seccomp_unavailable(u, "SystemCallFilter="))
                return 0;

        /* Without a configured errno, offending calls kill the process; with one, they fail with it */
        if (c->syscall_errno == 0)
                deny_action = SCMP_ACT_KILL;
        else
                deny_action = SCMP_ACT_ERRNO(c->syscall_errno);

        /* Whitelist: deny by default, allow what is listed. Otherwise: allow by default, deny what is
         * listed. */
        fallback_action = c->syscall_whitelist ? deny_action : SCMP_ACT_ALLOW;
        rule_action = c->syscall_whitelist ? SCMP_ACT_ALLOW : deny_action;

        if (needs_ambient_hack) {
                int r;

                r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
                if (r < 0)
                        return r;
        }

        return seccomp_load_syscall_filter_set_raw(fallback_action, c->syscall_filter, rule_action);
}

/* Restricts system call architectures to the set stored in the context. No-op (returns 0) if that set is
 * empty or seccomp is unavailable. */
static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (set_isempty(c->syscall_archs) ||
            skip_seccomp_unavailable(u, "SystemCallArchitectures="))
                return 0;

        return seccomp_restrict_archs(c->syscall_archs);
}

/* Applies the address family restriction stored in the context. No-op (returns 0) if none is configured
 * or seccomp is unavailable. */
static int apply_address_families(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!context_has_address_families(c) ||
            skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
                return 0;

        return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
}

/* Installs the MemoryDenyWriteExecute= seccomp restriction if the context requests it. No-op (returns 0)
 * if it is not requested or seccomp is unavailable. */
static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->memory_deny_write_execute ||
            skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
                return 0;

        return seccomp_memory_deny_write_execute();
}

/* Installs the RestrictRealtime= seccomp restriction if the context requests it. No-op (returns 0) if it
 * is not requested or seccomp is unavailable. */
static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->restrict_realtime ||
            skip_seccomp_unavailable(u, "RestrictRealtime="))
                return 0;

        return seccomp_restrict_realtime();
}

/* Blocks the legacy sysctl() system call when ProtectKernelTunables= is enabled. Many distributions
 * already compile it out of the kernel, but this covers the systems where it is still present. No-op
 * (returns 0) if the setting is off or seccomp is unavailable. */
static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->protect_kernel_tunables ||
            skip_seccomp_unavailable(u, "ProtectKernelTunables="))
                return 0;

        return seccomp_protect_sysctl();
}

/* With ProtectKernelModules= enabled, makes the system calls of the SYSCALL_FILTER_SET_MODULE set fail
 * with EPERM while everything else stays allowed. No-op (returns 0) if the setting is off or seccomp is
 * unavailable. */
static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->protect_kernel_modules ||
            skip_seccomp_unavailable(u, "ProtectKernelModules="))
                return 0;

        return seccomp_load_syscall_filter_set(
                        SCMP_ACT_ALLOW,
                        syscall_filter_sets + SYSCALL_FILTER_SET_MODULE,
                        SCMP_ACT_ERRNO(EPERM));
}

/* With PrivateDevices= enabled, additionally makes the system calls of the SYSCALL_FILTER_SET_RAW_IO set
 * (iopl and the other @raw-io calls) fail with EPERM while everything else stays allowed. No-op (returns
 * 0) if the setting is off or seccomp is unavailable. */
static int apply_private_devices(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->private_devices ||
            skip_seccomp_unavailable(u, "PrivateDevices="))
                return 0;

        return seccomp_load_syscall_filter_set(
                        SCMP_ACT_ALLOW,
                        syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO,
                        SCMP_ACT_ERRNO(EPERM));
}

/* Applies the namespace restriction mask stored in the context. No-op (returns 0) if no restriction is
 * set or seccomp is unavailable. */
static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!exec_context_restrict_namespaces_set(c) ||
            skip_seccomp_unavailable(u, "RestrictNamespaces="))
                return 0;

        return seccomp_restrict_namespaces(c->restrict_namespaces);
}

/* Locks the personality if LockPersonality= is enabled. The personality configured in the context is
 * used; if none is configured, opinionated_personality() picks one. No-op (returns 0) if the setting is
 * off or seccomp is unavailable; negative value on failure. */
static int apply_lock_personality(const Unit* u, const ExecContext *c) {
        unsigned long target;

        assert(u);
        assert(c);

        if (!c->lock_personality ||
            skip_seccomp_unavailable(u, "LockPersonality="))
                return 0;

        target = c->personality;

        /* Nothing specified explicitly: use either PER_LINUX or PER_LINUX32, depending on what is
         * currently set. */
        if (target == PERSONALITY_INVALID) {
                int r;

                r = opinionated_personality(&target);
                if (r < 0)
                        return r;
        }

        return seccomp_lock_personality(target);
}

#endif

/* Waits on the idle pipe: closes entries 1 and 2, waits up to IDLE_TIMEOUT_USEC for a hang-up on entry 0,
 * and on timeout writes one byte to entry 3 and waits up to IDLE_TIMEOUT2_USEC more. All four entries are
 * closed (and overwritten with safe_close()'s return value) by the time this returns. */
static void do_idle_pipe_dance(int idle_pipe[4]) {
        assert(idle_pipe);

        idle_pipe[1] = safe_close(idle_pipe[1]);
        idle_pipe[2] = safe_close(idle_pipe[2]);

        if (idle_pipe[0] >= 0) {
                bool timed_out;

                timed_out = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC) == 0;

                /* On timeout, signal systemd that we are bored and want to continue, then give it a
                 * little time to react to that. */
                if (timed_out &&
                    idle_pipe[3] >= 0 &&
                    write(idle_pipe[3], "x", 1) > 0)
                        (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);

                idle_pipe[0] = safe_close(idle_pipe[0]);
        }

        idle_pipe[3] = safe_close(idle_pipe[3]);
}

static const char *exec_directory_env_name_to_string(ExecDirectoryType t);

static int build_environment(
                const Unit *u,
                const ExecContext *c,
                const ExecParameters *p,
                size_t n_fds,
                const char *home,
                const char *username,
                const char *shell,
                dev_t journal_stream_dev,
                ino_t journal_stream_ino,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        ExecDirectoryType t;
        size_t n_env = 0;
        char *x;

        assert(u);
        assert(c);
        assert(p);
        assert(ret);

        our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
        if (!our_env)
                return -ENOMEM;

        if (n_fds > 0) {
                _cleanup_free_ char *joined = NULL;

                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                joined = strv_join(p->fd_names, ":");
                if (!joined)
                        return -ENOMEM;

                x = strjoin("LISTEN_FDNAMES=", joined);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
         * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
         * check the database directly. */
        if (p->flags & EXEC_NSS_BYPASS_BUS) {
                x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (home) {
                x = strappend("HOME=", home);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (username) {
                x = strappend("LOGNAME=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;

                x = strappend("USER=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (shell) {
                x = strappend("SHELL=", shell);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (!sd_id128_is_null(u->invocation_id)) {
                if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (exec_context_needs_term(c)) {
                const char *tty_path, *term = NULL;

                tty_path = exec_context_tty_path(c);

                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
                 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
                 * passes to PID 1 ends up all the way in the console login shown. */

                if (path_equal(tty_path, "/dev/console") && getppid() == 1)
                        term = getenv("TERM");
                if (!term)
                        term = default_term_for_tty(tty_path);

                x = strappend("TERM=", term);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                _cleanup_free_ char *pre = NULL, *joined = NULL;
                const char *n;

                if (!p->prefix[t])
                        continue;

                if (strv_isempty(c->directories[t].paths))
                        continue;

                n = exec_directory_env_name_to_string(t);
                if (!n)
                        continue;

                pre = strjoin(p->prefix[t], "/");
                if (!pre)
                        return -ENOMEM;

                joined = strv_join_prefix(c->directories[t].paths, ":", pre);
                if (!joined)
                        return -ENOMEM;

                x = strjoin(n, "=", joined);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        our_env[n_env++] = NULL;
        assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);

        *ret = TAKE_PTR(our_env);

        return 0;
}

/* Builds a "NAME=value" string array for every variable named in the context's pass_environment list that
 * is set in our own environment; unset variables are skipped. The array is returned in *ret (NULL if
 * nothing was collected). Returns 0 on success, -ENOMEM on allocation failure. */
static int build_pass_environment(const ExecContext *c, char ***ret) {
        _cleanup_strv_free_ char **collected = NULL;
        size_t n_used = 0, n_allocated = 0;
        char **name;

        STRV_FOREACH(name, c->pass_environment) {
                _cleanup_free_ char *assignment = NULL;
                const char *value;

                value = getenv(*name);
                if (!value)
                        continue;

                assignment = strjoin(*name, "=", value);
                if (!assignment)
                        return -ENOMEM;

                /* Make room for the new entry plus the terminating NULL */
                if (!GREEDY_REALLOC(collected, n_allocated, n_used + 2))
                        return -ENOMEM;

                collected[n_used++] = TAKE_PTR(assignment);
                collected[n_used] = NULL;
        }

        *ret = TAKE_PTR(collected);

        return 0;
}

/* Returns true if any setting of the context (in combination with the parameters and the optional
 * runtime object) requires a mount namespace, false otherwise. 'runtime' may be NULL. */
static bool exec_needs_mount_namespace(
                const ExecContext *context,
                const ExecParameters *params,
                const ExecRuntime *runtime) {

        assert(context);
        assert(params);

        if (context->root_image)
                return true;

        /* Any explicit path access list */
        if (!strv_isempty(context->read_write_paths))
                return true;
        if (!strv_isempty(context->read_only_paths))
                return true;
        if (!strv_isempty(context->inaccessible_paths))
                return true;

        /* Bind mounts, temporary file systems or explicit mount flags */
        if (context->n_bind_mounts > 0 ||
            context->n_temporary_filesystems > 0 ||
            context->mount_flags != 0)
                return true;

        /* Private /tmp only counts if the runtime object actually carries a directory for it */
        if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
                return true;

        /* The various sandboxing switches */
        if (context->private_devices || context->private_mounts)
                return true;
        if (context->protect_system != PROTECT_SYSTEM_NO ||
            context->protect_home != PROTECT_HOME_NO)
                return true;
        if (context->protect_kernel_tunables ||
            context->protect_kernel_modules ||
            context->protect_control_groups)
                return true;

        /* With a root directory set, API VFS mounting or any configured exec directory (for a type that
         * has a prefix) counts as well */
        if (context->root_directory) {
                ExecDirectoryType dt;

                if (context->mount_apivfs)
                        return true;

                for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
                        if (params->prefix[dt] && !strv_isempty(context->directories[dt].paths))
                                return true;
        }

        /* With a dynamic user, any state, cache or logs directory counts */
        if (context->dynamic_user) {
                if (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths))
                        return true;
                if (!strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths))
                        return true;
                if (!strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths))
                        return true;
        }

        return false;
}

/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a forked helper
 * process write the UID/GID maps for it. Returns 0 on success and a negative errno-style value on failure;
 * an error code reported by the helper is passed through, and unexpected helper behaviour is turned into
 * -EIO. */
static int setup_private_users(uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
        _cleanup_close_ int unshare_ready_fd = -1;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1;
        ssize_t n;
        int r;

        /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally. */

        /* Prepare the map contents up front, before forking, so that the child only has to write them out */
        if (uid != 0 && uid_is_valid(uid)) {
                r = asprintf(&uid_map,
                             "0 0 1\n"                      /* Map root → root */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             uid, uid);
                if (r < 0)
                        return -ENOMEM;
        } else {
                uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
                if (!uid_map)
                        return -ENOMEM;
        }

        if (gid != 0 && gid_is_valid(gid)) {
                r = asprintf(&gid_map,
                             "0 0 1\n"                      /* Map root → root */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             gid, gid);
                if (r < 0)
                        return -ENOMEM;
        } else {
                gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
                if (!gid_map)
                        return -ENOMEM;
        }

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -1;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                /* The child only writes to the error pipe, hence drop the read end */
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative error code to the parent through the pipe, best effort */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that the read() below sees EOF once the child's copy
         * is closed as well */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO; /* the child is only supposed to send negative values */
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
        pid = 0; /* the child has been waited for; clear pid so the sigkill_waitp cleanup no longer sees it */
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}

/* Creates the directories configured for one ExecDirectoryType (context->directories[type].paths) below
 * params->prefix[type], and hands them over to the given uid/gid.
 *
 * Returns 0 on success (including when no prefix is configured for this type, in which case nothing is
 * done). On failure a negative errno-style value is returned and *exit_status is set to the exit status
 * associated with the directory type in exit_status_table[]. */
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                int *exit_status) {

        /* Maps each directory type to the exit status reported through *exit_status on failure */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        char **rt;
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        /* No prefix configured for this directory type: nothing to set up */
        if (!params->prefix[type])
                return 0;

        /* When directories are to be chown()ed, fall back to 0 for any ID that is not valid */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        STRV_FOREACH(rt, context->directories[type].paths) {
                /* p is the public path below the prefix; pp is only set in the DynamicUser= branch below and
                 * then names the real directory below "<prefix>/private". */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = strjoin(params->prefix[type], "/", *rt);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (context->dynamic_user &&
                    !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
                        _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;

                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
                         * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
                         * whose UID is later on reused. To lock this down we use the same trick used by container
                         * managers to prohibit host users to get access to files of the same UID in containers: we
                         * place everything inside a directory that has an access mode of 0700 and is owned root:root,
                         * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
                         * to make this directory permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create a
                         * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
                         * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
                         * privileged host users can access "foo/" as usual, but unprivileged host users can't look
                         * into it. Inside of the namespace of the container "private/" is replaced by a more liberally
                         * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
                         * disabling the access boundary for the service and making sure it only gets access to the
                         * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
                         * owned by the service itself.
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
                         * files or sockets with other services. */

                        private_root = strjoin(params->prefix[type], "/private");
                        if (!private_root) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        pp = strjoin(private_root, "/", *rt);
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) < 0 && errno == ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                if (rename(p, pp) < 0) {
                                        r = -errno;
                                        goto fail;
                                }
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        parent = dirname_malloc(p);
                        if (!parent) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* The symlink target is expressed relative to the directory containing p */
                        r = path_make_relative(parent, pp, &relative);
                        if (r < 0)
                                goto fail;

                        /* And link it up from the original place */
                        r = symlink_idempotent(relative, p);
                        if (r < 0)
                                goto fail;

                        /* Lock down the access mode */
                        if (chmod(pp, context->directories[type].mode) < 0) {
                                r = -errno;
                                goto fail;
                        }
                } else {
                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0 && r != -EEXIST)
                                goto fail;
                        /* The directory already existed and this is not a DynamicUser= unit: leave the existing
                         * tree untouched, i.e. skip the ownership change below. */
                        if (r == -EEXIST && !context->dynamic_user)
                                continue;
                }

                /* Don't change the owner of the configuration directory, as in the common case it is not written to by
                 * a service, and shall not be writable. */
                if (type == EXEC_DIRECTORY_CONFIGURATION)
                        continue;

                /* Then, change the ownership of the whole tree, if necessary. If the private directory was set
                 * up above (pp != NULL) that is the one to chown, otherwise the plain one. */
                r = path_chown_recursive(pp ?: p, uid, gid);
                if (r < 0)
                        goto fail;
        }

        return 0;

fail:
        /* Report which kind of directory failed via the matching exit status */
        *exit_status = exit_status_table[type];
        return r;
}

#if ENABLE_SMACK
/* Applies the SMACK label for the process that is about to run 'command'.
 *
 * If the context carries an explicit smack_process_label, that label is applied. Otherwise, and only when
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from the command's
 * binary is applied, falling back to SMACK_DEFAULT_PROCESS_LABEL if none could be read.
 *
 * Returns 0 on success, or the negative error returned by the failing helper. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                /* An explicitly configured label takes precedence over anything derived from the binary */
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing attribute or missing SMACK support is not an error here: exec_label simply stays
                 * NULL and the build-time default is used below. */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif

/* Builds the full list of bind mounts for a unit: the explicitly configured ones from the context, followed by
 * one entry per configured exec directory (for every directory type that has a prefix).
 *
 * On success the newly allocated array, its length and the list of directories that shall be overmounted
 * with an empty tmpfs are returned through the ret_* parameters, and the number of bind mounts is returned
 * (0 if there are none, in which case all outputs are NULL/0). On failure a negative errno-style value is
 * returned and everything allocated so far is released. */
static int compile_bind_mounts(
                const ExecContext *context,
                const ExecParameters *params,
                BindMount **ret_bind_mounts,
                size_t *ret_n_bind_mounts,
                char ***ret_empty_directories) {

        _cleanup_strv_free_ char **empty_directories = NULL;
        BindMount *bind_mounts;
        size_t total, filled = 0, k;
        ExecDirectoryType t;
        bool root_overridden;
        int r;

        assert(context);
        assert(params);
        assert(ret_bind_mounts);
        assert(ret_n_bind_mounts);
        assert(ret_empty_directories);

        /* First pass: figure out how many entries we are going to need */
        total = context->n_bind_mounts;
        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
                if (params->prefix[t])
                        total += strv_length(context->directories[t].paths);

        if (total == 0) {
                *ret_bind_mounts = NULL;
                *ret_n_bind_mounts = 0;
                *ret_empty_directories = NULL;
                return 0;
        }

        bind_mounts = new(BindMount, total);
        if (!bind_mounts)
                return -ENOMEM;

        /* Deep-copy the explicitly configured bind mounts */
        for (k = 0; k < context->n_bind_mounts; k++) {
                const BindMount *item = &context->bind_mounts[k];
                char *src, *dst;

                src = strdup(item->source);
                if (!src) {
                        r = -ENOMEM;
                        goto finish;
                }

                dst = strdup(item->destination);
                if (!dst) {
                        free(src);
                        r = -ENOMEM;
                        goto finish;
                }

                bind_mounts[filled] = (BindMount) {
                        .source = src,
                        .destination = dst,
                        .read_only = item->read_only,
                        .recursive = item->recursive,
                        .ignore_enoent = item->ignore_enoent,
                };
                filled++;
        }

        root_overridden = context->root_directory || context->root_image;

        /* Then add one bind mount per exec directory */
        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                bool below_private;
                char **suffix;

                if (!params->prefix[t] || strv_isempty(context->directories[t].paths))
                        continue;

                /* For dynamic users everything except runtime and configuration directories lives below
                 * "<prefix>/private". */
                below_private = context->dynamic_user &&
                        !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION);

                if (below_private && !root_overridden) {
                        char *private_root;

                        /* The process needs access to its own directory, hence the normally inaccessible
                         * "private" subdirectory is overmounted with a tmpfs that is accessible and empty,
                         * apart from the submounts added below. */

                        private_root = strjoin(params->prefix[t], "/private");
                        if (!private_root) {
                                r = -ENOMEM;
                                goto finish;
                        }

                        r = strv_consume(&empty_directories, private_root);
                        if (r < 0)
                                goto finish;
                }

                STRV_FOREACH(suffix, context->directories[t].paths) {
                        char *src, *dst;

                        src = strjoin(params->prefix[t], below_private ? "/private/" : "/", *suffix);
                        if (!src) {
                                r = -ENOMEM;
                                goto finish;
                        }

                        if (below_private && root_overridden)
                                /* With RootDirectory= or RootImage= the symlink pointing into the private
                                 * directory is not created in the root directory, so mount the directory at
                                 * the 'non-private' location instead. */
                                dst = strjoin(params->prefix[t], "/", *suffix);
                        else
                                dst = strdup(src);
                        if (!dst) {
                                free(src);
                                r = -ENOMEM;
                                goto finish;
                        }

                        bind_mounts[filled] = (BindMount) {
                                .source = src,
                                .destination = dst,
                                .read_only = false,
                                .recursive = true,
                                .ignore_enoent = false,
                        };
                        filled++;
                }
        }

        assert(filled == total);

        *ret_bind_mounts = bind_mounts;
        *ret_n_bind_mounts = total;
        *ret_empty_directories = TAKE_PTR(empty_directories);

        return (int) total;

finish:
        /* Only the first 'filled' entries have been initialized so far */
        bind_mount_free_many(bind_mounts, filled);
        return r;
}

static int apply_mount_namespace(
                const Unit *u,
                const ExecCommand *command,
                const ExecContext *context,
                const ExecParameters *params,
                const ExecRuntime *runtime) {

        _cleanup_strv_free_ char **empty_directories = NULL;
        char *tmp = NULL, *var = NULL;
        const char *root_dir = NULL, *root_image = NULL;
        NamespaceInfo ns_info;
        bool needs_sandboxing;
        BindMount *bind_mounts = NULL;
        size_t n_bind_mounts = 0;
        int r;

        assert(context);

        /* The runtime struct only contains the parent of the private /tmp,
         * which is non-accessible to world users. Inside of it there's a /tmp
         * that is sticky, and that's the one we want to use here. */

        if (context->private_tmp && runtime) {
                if (runtime->tmp_dir)
                        tmp = strjoina(runtime->tmp_dir, "/tmp");
                if (runtime->var_tmp_dir)
                        var = strjoina(runtime->var_tmp_dir, "/tmp");
        }

        if (params->flags & EXEC_APPLY_CHROOT) {
                root_image = context->root_image;

                if (!root_image)
                        root_dir = context->root_directory;
        }

        r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
        if (r < 0)
                return r;

        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
        if (needs_sandboxing)
                ns_info = (NamespaceInfo) {
                        .ignore_protect_paths = false,
                        .private_dev = context->private_devices,
                        .protect_control_groups = context->protect_control_groups,
                        .protect_kernel_tunables = context->protect_kernel_tunables,
                        .protect_kernel_modules = context->protect_kernel_modules,
                        .mount_apivfs = context->mount_apivfs,
                        .private_mounts = context->private_mounts,
                };
        else if (!context->dynamic_user && root_dir)
                /*
                 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
                 * sandbox info, otherwise enforce it, don't ignore protected paths and
                 * fail if we are enable to apply the sandbox inside the mount namespace.
                 */
                ns_info = (NamespaceInfo) {
                        .ignore_protect_paths = true,
                };
        else
                ns_info = (NamespaceInfo) {};

        r = setup_namespace(root_dir, root_image,
                            &ns_info, context->read_write_paths,
                            needs_sandboxing ? context->read_only_paths : NULL,
                            needs_sandboxing ? context->inaccessible_paths : NULL,
                            empty_directories,
                            bind_mounts,
                            n_bind_mounts,
                            context->temporary_filesystems,
                            context->n_temporary_filesystems,
                            tmp,
                            var,
                            needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
                            needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
                            context->mount_flags,
                            DISSECT_IMAGE_DISCARD_ON_LOOP);

        bind_mount_free_many(bind_mounts, n_bind_mounts);

        /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
         * that with a special, recognizable error ENOANO. In this case, silently proceeed, but only if exclusively
         * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
         * completely different execution environment. */
        if (r == -ENOANO &&
            n_bind_mounts == 0 && context->n_temporary_filesystems == 0 &&
            !root_dir && !root_image &&
            !context->dynamic_user) {
                log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
                return 0;
        }

        return r;
}

/* Changes into the working directory configured for the unit, doing a chroot() first if that is requested
 * and no mount namespace takes care of the root directory.
 *
 * 'home' is the resolved home directory, needed when the working directory is configured to be the home
 * directory. Returns 0 on success; on failure returns a negative errno-style value and sets *exit_status to
 * EXIT_CHDIR or EXIT_CHROOT. A failing chdir() is ignored if the context says a missing working directory
 * is OK. */
static int apply_working_directory(
                const ExecContext *context,
                const ExecParameters *params,
                const char *home,
                const bool needs_mount_ns,
                int *exit_status) {

        const char *d, *wd;

        assert(context);
        assert(exit_status);

        /* Pick the directory to change into: the home directory, the configured one, or "/" as fallback */
        if (context->working_directory_home) {
                if (!home) {
                        *exit_status = EXIT_CHDIR;
                        return -ENXIO;
                }

                wd = home;
        } else
                wd = context->working_directory ?: "/";

        if (!(params->flags & EXEC_APPLY_CHROOT))
                /* We are not changing the root ourselves, hence prefix the path with the root directory */
                d = prefix_roota(context->root_directory, wd);
        else {
                if (!needs_mount_ns &&
                    context->root_directory &&
                    chroot(context->root_directory) < 0) {
                        *exit_status = EXIT_CHROOT;
                        return -errno;
                }

                d = wd;
        }

        if (chdir(d) >= 0 || context->working_directory_missing_ok)
                return 0;

        *exit_status = EXIT_CHDIR;
        return -errno;
}

/* Sets up a new session keyring for the service, optionally linked to the user keyring, and stores the
 * unit's invocation ID in it.
 *
 * Does nothing and returns 0 unless EXEC_NEW_KEYRING is set in p->flags and the keyring mode is something
 * other than EXEC_KEYRING_INHERIT. While the keyring is created the process temporarily switches to the
 * specified uid/gid (if they are valid and differ from the current ones) and switches back afterwards.
 *
 * Returns 0 on success or if the kernel refused/does not support keyrings (ENOSYS, EACCES, EPERM, EDQUOT
 * are only logged at debug level), a negative errno-style value otherwise. */
static int setup_keyring(
                const Unit *u,
                const ExecContext *context,
                const ExecParameters *p,
                uid_t uid, gid_t gid) {

        key_serial_t keyring;
        int r = 0;
        uid_t saved_uid;
        gid_t saved_gid;

        assert(u);
        assert(context);
        assert(p);

        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */

        if (!(p->flags & EXEC_NEW_KEYRING))
                return 0;

        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
                return 0;

        /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
         * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
         * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
         * & group is just as nasty as acquiring a reference to the user keyring. */

        /* Remember the current IDs so that they can be restored further down */
        saved_uid = getuid();
        saved_gid = getgid();

        /* Nothing has been changed yet at this point, hence a plain return on failure is fine */
        if (gid_is_valid(gid) && gid != saved_gid) {
                if (setregid(gid, -1) < 0)
                        return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
        }

        if (uid_is_valid(uid) && uid != saved_uid) {
                if (setreuid(uid, -1) < 0) {
                        r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
                        goto out;
                }
        }

        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
        if (keyring == -1) {
                /* The first three cases leave r at 0, i.e. the failure is not propagated to the caller */
                if (errno == ENOSYS)
                        log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
                else if (IN_SET(errno, EACCES, EPERM))
                        log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
                else if (errno == EDQUOT)
                        log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
                else
                        r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");

                goto out;
        }

        /* When requested link the user keyring into the session keyring. */
        if (context->keyring_mode == EXEC_KEYRING_SHARED) {

                if (keyctl(KEYCTL_LINK,
                           KEY_SPEC_USER_KEYRING,
                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
                        r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
                        goto out;
                }
        }

        /* Restore uid/gid back */
        if (uid_is_valid(uid) && uid != saved_uid) {
                if (setreuid(saved_uid, -1) < 0) {
                        r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
                        goto out;
                }
        }

        if (gid_is_valid(gid) && gid != saved_gid) {
                if (setregid(saved_gid, -1) < 0)
                        return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
        }

        /* Populate the keyring with the invocation ID by default, as original saved_uid. */
        if (!sd_id128_is_null(u->invocation_id)) {
                key_serial_t key;

                key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
                if (key == -1)
                        log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
                else {
                        /* Restrict the key to view/read/search for possessor and user */
                        if (keyctl(KEYCTL_SETPERM, key,
                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
                                r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
                }
        }

out:
        /* Revert back uid & gid for the last time, and exit */
        /* no extra logging, as only the first already reported error matters */
        if (getuid() != saved_uid)
                (void) setreuid(saved_uid, -1);

        if (getgid() != saved_gid)
                (void) setregid(saved_gid, -1);

        return r;
}

/* Appends the valid (i.e. non-negative) file descriptors of 'pair' to 'array', advancing the element counter
 * *n for each one added. A NULL pair is accepted and ignored. The caller has to make sure 'array' has room
 * for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
        size_t slot;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (slot = 0; slot < 2; slot++) {
                if (pair[slot] < 0)
                        continue;

                array[*n] = pair[slot];
                (*n)++;
        }
}

/* Closes all file descriptors of the process except the ones that are still needed: the stdio fds from the
 * parameters, the socket and exec fds, the 'fds' array passed in, the storage sockets of the runtime and
 * dynamic credential objects, and the user lookup fd. Returns whatever close_all_fds() returns. */
static int close_remaining_fds(
                const ExecParameters *params,
                const ExecRuntime *runtime,
                const DynamicCreds *dcreds,
                int user_lookup_fd,
                int socket_fd,
                int exec_fd,
                int *fds, size_t n_fds) {

        /* Room for the caller's fds plus at most 12 more: 3 stdio fds, socket fd, exec fd, three socket pairs
         * and the user lookup fd. */
        int dont_close[n_fds + 12];
        size_t n_dont_close = 0, k;
        int standalone[5];

        assert(params);

        standalone[0] = params->stdin_fd;
        standalone[1] = params->stdout_fd;
        standalone[2] = params->stderr_fd;
        standalone[3] = socket_fd;
        standalone[4] = exec_fd;

        /* Negative values mean "not set" and are skipped */
        for (k = 0; k < sizeof(standalone) / sizeof(standalone[0]); k++)
                if (standalone[k] >= 0)
                        dont_close[n_dont_close++] = standalone[k];

        if (n_fds > 0) {
                memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
                n_dont_close += n_fds;
        }

        if (runtime)
                append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);

        if (dcreds && dcreds->user)
                append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
        if (dcreds && dcreds->group)
                append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);

        if (user_lookup_fd >= 0)
                dont_close[n_dont_close++] = user_lookup_fd;

        return close_all_fds(dont_close, n_dont_close);
}

/* Reports the resolved UID/GID of a unit over 'user_lookup_fd'.
 *
 * The message is written with a single writev() call and consists of the UID, the GID and the unit name, in
 * that order. Nothing is sent (and 0 is returned) if there is no fd to send on, or if neither a valid UID nor
 * a valid GID is known. Returns 0 on success, -errno if the write fails. */
static int send_user_lookup(
                Unit *unit,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit);

        /* No channel to report on, or nothing was resolved: suppress the message */
        if (user_lookup_fd < 0 ||
            (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        {
                struct iovec iov[] = {
                        IOVEC_INIT(&uid, sizeof(uid)),
                        IOVEC_INIT(&gid, sizeof(gid)),
                        IOVEC_INIT_STRING(unit->id),
                };

                if (writev(user_lookup_fd, iov, 3) < 0)
                        return -errno;
        }

        return 0;
}

/* Determines a home directory to use when the working directory is configured to be the home directory
 * (WorkingDirectory=~) and none is known yet.
 *
 * Returns 0 if nothing had to be done (*home already set, or the home directory is not wanted), 1 if *home
 * was filled in, or a negative error from get_home_dir(). When the directory comes from get_home_dir() the
 * string is stored in *buf and *home points at it. */
static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
        int r;

        assert(c);
        assert(home);
        assert(buf);

        if (*home || !c->working_directory_home)
                return 0;

        if (uid != 0) {
                r = get_home_dir(buf);
                if (r < 0)
                        return r;

                *home = *buf;
        } else
                /* UID 0 always gets the hardcoded /root */
                *home = "/root";

        return 1;
}

static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
        _cleanup_strv_free_ char ** list = NULL;
        ExecDirectoryType t;
        int r;

        assert(c);
        assert(p);
        assert(ret);

        assert(c->dynamic_user);

        /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
         * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
         * directories. */

        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                char **i;

                if (t == EXEC_DIRECTORY_CONFIGURATION)
                        continue;

                if (!p->prefix[t])
                        continue;

                STRV_FOREACH(i, c->directories[t].paths) {
                        char *e;

                        if (t == EXEC_DIRECTORY_RUNTIME)
                                e = strjoin(p->prefix[t], "/", *i);
                        else
                                e = strjoin(p->prefix[t], "/private/", *i);
                        if (!e)
                                return -ENOMEM;

                        r = strv_consume(&list, e);
                        if (r < 0)
                                return r;
                }
        }

        *ret = TAKE_PTR(list);

        return 0;
}

/* Forward declaration; the definition lives elsewhere in this file. */
static char *exec_command_line(char **argv);

static int exec_child(
                Unit *unit,
                const ExecCommand *command,
                const ExecContext *context,
                const ExecParameters *params,
                ExecRuntime *runtime,
                DynamicCreds *dcreds,
                int socket_fd,
                int named_iofds[3],
                int *fds,
                size_t n_socket_fds,
                size_t n_storage_fds,
                char **files_env,
                int user_lookup_fd,
                int *exit_status) {

        _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
        int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
        _cleanup_free_ gid_t *supplementary_gids = NULL;
        const char *username = NULL, *groupname = NULL;
        _cleanup_free_ char *home_buffer = NULL;
        const char *home = NULL, *shell = NULL;
        dev_t journal_stream_dev = 0;
        ino_t journal_stream_ino = 0;
        bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
                needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
                needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
                needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
#if HAVE_SELINUX
        _cleanup_free_ char *mac_selinux_context_net = NULL;
        bool use_selinux = false;
#endif
#if ENABLE_SMACK
        bool use_smack = false;
#endif
#if HAVE_APPARMOR
        bool use_apparmor = false;
#endif
        uid_t uid = UID_INVALID;
        gid_t gid = GID_INVALID;
        size_t n_fds;
        ExecDirectoryType dt;
        int secure_bits;

        assert(unit);
        assert(command);
        assert(context);
        assert(params);
        assert(exit_status);

        rename_process_from_path(command->path);

        /* We reset exactly these signals, since they are the
         * only ones we set to SIG_IGN in the main daemon. All
         * others we leave untouched because we set them to
         * SIG_DFL or a valid handler initially, both of which
         * will be demoted to SIG_DFL. */
        (void) default_signals(SIGNALS_CRASH_HANDLER,
                               SIGNALS_IGNORE, -1);

        if (context->ignore_sigpipe)
                (void) ignore_signals(SIGPIPE, -1);

        r = reset_signal_mask();
        if (r < 0) {
                *exit_status = EXIT_SIGNAL_MASK;
                return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
        }

        if (params->idle_pipe)
                do_idle_pipe_dance(params->idle_pipe);

        /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
         * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
         * any fds open we don't really want open during the transition. In order to make logging work, we switch the
         * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */

        log_forget_fds();
        log_set_open_when_needed(true);

        /* In case anything used libc syslog(), close this here, too */
        closelog();

        n_fds = n_socket_fds + n_storage_fds;
        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
        if (r < 0) {
                *exit_status = EXIT_FDS;
                return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
        }

        if (!context->same_pgrp)
                if (setsid() < 0) {
                        *exit_status = EXIT_SETSID;
                        return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
                }

        exec_context_tty_reset(context, params);

        if (unit_shall_confirm_spawn(unit)) {
                const char *vc = params->confirm_spawn;
                _cleanup_free_ char *cmdline = NULL;

                cmdline = exec_command_line(command->argv);
                if (!cmdline) {
                        *exit_status = EXIT_MEMORY;
                        return log_oom();
                }

                r = ask_for_confirmation(vc, unit, cmdline);
                if (r != CONFIRM_EXECUTE) {
                        if (r == CONFIRM_PRETEND_SUCCESS) {
                                *exit_status = EXIT_SUCCESS;
                                return 0;
                        }
                        *exit_status = EXIT_CONFIRM;
                        log_unit_error(unit, "Execution cancelled by the user");
                        return -ECANCELED;
                }
        }

        /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
         * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
         * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
         * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
         * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
        if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
            setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
                *exit_status = EXIT_MEMORY;
                return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
        }

        if (context->dynamic_user && dcreds) {
                _cleanup_strv_free_ char **suggested_paths = NULL;

                /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
                 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
                if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
                        *exit_status = EXIT_USER;
                        return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
                }

                r = compile_suggested_paths(context, params, &suggested_paths);
                if (r < 0) {
                        *exit_status = EXIT_MEMORY;
                        return log_oom();
                }

                r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
                if (r < 0) {
                        *exit_status = EXIT_USER;
                        if (r == -EILSEQ) {
                                log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
                                return -EOPNOTSUPP;
                        }
                        return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
                }

                if (!uid_is_valid(uid)) {
                        *exit_status = EXIT_USER;
                        log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
                        return -ESRCH;
                }

                if (!gid_is_valid(gid)) {
                        *exit_status = EXIT_USER;
                        log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
                        return -ESRCH;
                }

                if (dcreds->user)
                        username = dcreds->user->name;

        } else {
                r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
                if (r < 0) {
                        *exit_status = EXIT_USER;
                        return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
                }

                r = get_fixed_group(context, &groupname, &gid);
                if (r < 0) {
                        *exit_status = EXIT_GROUP;
                        return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
                }
        }

        /* Initialize user supplementary groups and get SupplementaryGroups= ones */
        r = get_supplementary_groups(context, username, groupname, gid,
                                     &supplementary_gids, &ngids);
        if (r < 0) {
                *exit_status = EXIT_GROUP;
                return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
        }

        r = send_user_lookup(unit, user_lookup_fd, uid, gid);
        if (r < 0) {
                *exit_status = EXIT_USER;
                return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
        }

        user_lookup_fd = safe_close(user_lookup_fd);

        r = acquire_home(context, uid, &home, &home_buffer);
        if (r < 0) {
                *exit_status = EXIT_CHDIR;
                return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
        }

        /* If a socket is connected to STDIN/STDOUT/STDERR, we
         * must make sure to drop O_NONBLOCK */
        if (socket_fd >= 0)
                (void) fd_nonblock(socket_fd, false);

        r = setup_input(context, params, socket_fd, named_iofds);
        if (r < 0) {
                *exit_status = EXIT_STDIN;
                return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
        }

        r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
        if (r < 0) {
                *exit_status = EXIT_STDOUT;
                return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
        }

        r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
        if (r < 0) {
                *exit_status = EXIT_STDERR;
                return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
        }

        if (params->cgroup_path) {
                r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
                if (r < 0) {
                        *exit_status = EXIT_CGROUP;
                        return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
                }
        }

        if (context->oom_score_adjust_set) {
                /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
                 * prohibit write access to this file, and we shouldn't trip up over that. */
                r = set_oom_score_adjust(context->oom_score_adjust);
                if (IN_SET(r, -EPERM, -EACCES))
                        log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
                else if (r < 0) {
                        *exit_status = EXIT_OOM_ADJUST;
                        return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
                }
        }

        if (context->nice_set)
                if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
                        *exit_status = EXIT_NICE;
                        return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
                }

        if (context->cpu_sched_set) {
                struct sched_param param = {
                        .sched_priority = context->cpu_sched_priority,
                };

                r = sched_setscheduler(0,
                                       context->cpu_sched_policy |
                                       (context->cpu_sched_reset_on_fork ?
                                        SCHED_RESET_ON_FORK : 0),
                                       &param);
                if (r < 0) {
                        *exit_status = EXIT_SETSCHEDULER;
                        return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
                }
        }

        if (context->cpuset)
                if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
                        *exit_status = EXIT_CPUAFFINITY;
                        return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
                }

        if (context->ioprio_set)
                if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
                        *exit_status = EXIT_IOPRIO;
                        return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
                }

        if (context->timer_slack_nsec != NSEC_INFINITY)
                if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
                        *exit_status = EXIT_TIMERSLACK;
                        return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
                }

        if (context->personality != PERSONALITY_INVALID) {
                r = safe_personality(context->personality);
                if (r < 0) {
                        *exit_status = EXIT_PERSONALITY;
                        return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
                }
        }

        if (context->utmp_id)
                utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
                                      context->tty_path,
                                      context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
                                      context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
                                      USER_PROCESS,
                                      username);

        if (context->user) {
                r = chown_terminal(STDIN_FILENO, uid);
                if (r < 0) {
                        *exit_status = EXIT_STDIN;
                        return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
                }
        }

        /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
         * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
         * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
         * touch a single hierarchy too. */
        if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
                r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
                if (r < 0) {
                        *exit_status = EXIT_CGROUP;
                        return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
                }
        }

        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
                r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
                if (r < 0)
                        return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
        }

        r = build_environment(
                        unit,
                        context,
                        params,
                        n_fds,
                        home,
                        username,
                        shell,
                        journal_stream_dev,
                        journal_stream_ino,
                        &our_env);
        if (r < 0) {
                *exit_status = EXIT_MEMORY;
                return log_oom();
        }

        r = build_pass_environment(context, &pass_env);
        if (r < 0) {
                *exit_status = EXIT_MEMORY;
                return log_oom();
        }

        accum_env = strv_env_merge(5,
                                   params->environment,
                                   our_env,
                                   pass_env,
                                   context->environment,
                                   files_env,
                                   NULL);
        if (!accum_env) {
                *exit_status = EXIT_MEMORY;
                return log_oom();
        }
        accum_env = strv_env_clean(accum_env);

        (void) umask(context->umask);

        r = setup_keyring(unit, context, params, uid, gid);
        if (r < 0) {
                *exit_status = EXIT_KEYRING;
                return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
        }

        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);

        /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
        needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();

        /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
        if (needs_ambient_hack)
                needs_setuid = false;
        else
                needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));

        if (needs_sandboxing) {
                /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
                 * present. The actual MAC context application will happen later, as late as possible, to avoid
                 * impacting our own code paths. */

#if HAVE_SELINUX
                use_selinux = mac_selinux_use();
#endif
#if ENABLE_SMACK
                use_smack = mac_smack_use();
#endif
#if HAVE_APPARMOR
                use_apparmor = mac_apparmor_use();
#endif
        }

        if (needs_setuid) {
                if (context->pam_name && username) {
                        r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
                        if (r < 0) {
                                *exit_status = EXIT_PAM;
                                return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
                        }
                }
        }

        if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
                if (ns_type_supported(NAMESPACE_NET)) {
                        r = setup_netns(runtime->netns_storage_socket);
                        if (r < 0) {
                                *exit_status = EXIT_NETWORK;
                                return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
                        }
                } else
                        log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
        }

        needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
        if (needs_mount_namespace) {
                r = apply_mount_namespace(unit, command, context, params, runtime);
                if (r < 0) {
                        *exit_status = EXIT_NAMESPACE;
                        return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
                }
        }

        /* Apply just after mount namespace setup */
        r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");

        /* Drop groups as early as possible */
        if (needs_setuid) {
                r = enforce_groups(gid, supplementary_gids, ngids);
                if (r < 0) {
                        *exit_status = EXIT_GROUP;
                        return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
                }
        }

        if (needs_sandboxing) {
#if HAVE_SELINUX
                if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
                        r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
                        if (r < 0) {
                                *exit_status = EXIT_SELINUX_CONTEXT;
                                return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
                        }
                }
#endif

                if (context->private_users) {
                        r = setup_private_users(uid, gid);
                        if (r < 0) {
                                *exit_status = EXIT_USER;
                                return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
                        }
                }
        }

        /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
         * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
         * however if we have it as we want to keep it open until the final execve(). */

        if (params->exec_fd >= 0) {
                exec_fd = params->exec_fd;

                if (exec_fd < 3 + (int) n_fds) {
                        int moved_fd;

                        /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
                         * process we are about to execute. */

                        moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
                        if (moved_fd < 0) {
                                *exit_status = EXIT_FDS;
                                return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
                        }

                        safe_close(exec_fd);
                        exec_fd = moved_fd;
                } else {
                        /* This fd should be FD_CLOEXEC already, but let's make sure. */
                        r = fd_cloexec(exec_fd, true);
                        if (r < 0) {
                                *exit_status = EXIT_FDS;
                                return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
                        }
                }

                fds_with_exec_fd = newa(int, n_fds + 1);
                memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
                fds_with_exec_fd[n_fds] = exec_fd;
                n_fds_with_exec_fd = n_fds + 1;
        } else {
                fds_with_exec_fd = fds;
                n_fds_with_exec_fd = n_fds;
        }

        r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
        if (r >= 0)
                r = shift_fds(fds, n_fds);
        if (r >= 0)
                r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
        if (r < 0) {
                *exit_status = EXIT_FDS;
                return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
        }

        /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
         * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
         * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
         * came this far. */

        secure_bits = context->secure_bits;

        if (needs_sandboxing) {
                uint64_t bset;
                int which_failed;

                r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
                if (r < 0) {
                        *exit_status = EXIT_LIMITS;
                        return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
                }

                /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
                if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
                        if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
                                *exit_status = EXIT_LIMITS;
                                return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
                        }
                }

#if ENABLE_SMACK
                /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
                 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
                if (use_smack) {
                        r = setup_smack(context, command);
                        if (r < 0) {
                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
                                return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
                        }
                }
#endif

                bset = context->capability_bounding_set;
                /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
                 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
                 * instead of us doing that */
                if (needs_ambient_hack)
                        bset |= (UINT64_C(1) << CAP_SETPCAP) |
                                (UINT64_C(1) << CAP_SETUID) |
                                (UINT64_C(1) << CAP_SETGID);

                if (!cap_test_all(bset)) {
                        r = capability_bounding_set_drop(bset, false);
                        if (r < 0) {
                                *exit_status = EXIT_CAPABILITIES;
                                return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
                        }
                }

                /* This is done before enforce_user, but ambient set
                 * does not survive over setresuid() if keep_caps is not set. */
                if (!needs_ambient_hack &&
                    context->capability_ambient_set != 0) {
                        r = capability_ambient_set_apply(context->capability_ambient_set, true);
                        if (r < 0) {
                                *exit_status = EXIT_CAPABILITIES;
                                return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
                        }
                }
        }

        if (needs_setuid) {
                if (context->user) {
                        r = enforce_user(context, uid);
                        if (r < 0) {
                                *exit_status = EXIT_USER;
                                return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
                        }

                        if (!needs_ambient_hack &&
                            context->capability_ambient_set != 0) {

                                /* Fix the ambient capabilities after user change. */
                                r = capability_ambient_set_apply(context->capability_ambient_set, false);
                                if (r < 0) {
                                        *exit_status = EXIT_CAPABILITIES;
                                        return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
                                }

                                /* If we were asked to change user and ambient capabilities
                                 * were requested, we had to add keep-caps to the securebits
                                 * so that we would maintain the inherited capability set
                                 * through the setresuid(). Make sure that the bit is added
                                 * also to the context secure_bits so that we don't try to
                                 * drop the bit away next. */

                                secure_bits |= 1<<SECURE_KEEP_CAPS;
                        }
                }
        }

        if (needs_sandboxing) {
                /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
                 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
                 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
                 * are restricted. */

#if HAVE_SELINUX
                if (use_selinux) {
                        char *exec_context = mac_selinux_context_net ?: context->selinux_context;

                        if (exec_context) {
                                r = setexeccon(exec_context);
                                if (r < 0) {
                                        *exit_status = EXIT_SELINUX_CONTEXT;
                                        return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
                                }
                        }
                }
#endif

#if HAVE_APPARMOR
                if (use_apparmor && context->apparmor_profile) {
                        r = aa_change_onexec(context->apparmor_profile);
                        if (r < 0 && !context->apparmor_profile_ignore) {
                                *exit_status = EXIT_APPARMOR_PROFILE;
                                return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
                        }
                }
#endif

                /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
                 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
                if (prctl(PR_GET_SECUREBITS) != secure_bits)
                        if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
                                *exit_status = EXIT_SECUREBITS;
                                return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
                        }

                if (context_has_no_new_privileges(context))
                        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                *exit_status = EXIT_NO_NEW_PRIVILEGES;
                                return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
                        }

#if HAVE_SECCOMP
                r = apply_address_families(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_ADDRESS_FAMILIES;
                        return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
                }

                r = apply_memory_deny_write_execute(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
                }

                r = apply_restrict_realtime(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
                }

                r = apply_restrict_namespaces(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
                }

                r = apply_protect_sysctl(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
                }

                r = apply_protect_kernel_modules(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
                }

                r = apply_private_devices(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
                }

                r = apply_syscall_archs(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
                }

                r = apply_lock_personality(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
                }

                /* This really should remain the last step before the execve(), to make sure our own code is affected
                 * by the filter as little as possible. */
                r = apply_syscall_filter(unit, context, needs_ambient_hack);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
                }
#endif
        }

        if (!strv_isempty(context->unset_environment)) {
                char **ee = NULL;

                ee = strv_env_delete(accum_env, 1, context->unset_environment);
                if (!ee) {
                        *exit_status = EXIT_MEMORY;
                        return log_oom();
                }

                strv_free_and_replace(accum_env, ee);
        }

        final_argv = replace_env_argv(command->argv, accum_env);
        if (!final_argv) {
                *exit_status = EXIT_MEMORY;
                return log_oom();
        }

        if (DEBUG_LOGGING) {
                _cleanup_free_ char *line;

                line = exec_command_line(final_argv);
                if (line)
                        log_struct(LOG_DEBUG,
                                   "EXECUTABLE=%s", command->path,
                                   LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
                                   LOG_UNIT_ID(unit),
                                   LOG_UNIT_INVOCATION_ID(unit));
        }

        if (exec_fd >= 0) {
                uint8_t hot = 1;

                /* We have finished with all our initializations. Let's now let the manager know that. From this point
                 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */

                if (write(exec_fd, &hot, sizeof(hot)) < 0) {
                        *exit_status = EXIT_EXEC;
                        return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
                }
        }

        execve(command->path, final_argv, accum_env);
        r = -errno;

        if (exec_fd >= 0) {
                uint8_t hot = 0;

                /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
                 * that POLLHUP on it no longer means execve() succeeded. */

                if (write(exec_fd, &hot, sizeof(hot)) < 0) {
                        *exit_status = EXIT_EXEC;
                        return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
                }
        }

        if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
                log_struct_errno(LOG_INFO, r,
                                 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
                                 LOG_UNIT_ID(unit),
                                 LOG_UNIT_INVOCATION_ID(unit),
                                 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
                                                  command->path),
                                 "EXECUTABLE=%s", command->path);
                return 0;
        }

        *exit_status = EXIT_EXEC;
        return log_unit_error_errno(unit, r, "Failed to execute command: %m");
}

/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() below already needs to
 * call them. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);

/* Forks off a child process for 'command' and hands control to exec_child() inside it.
 *
 * Before forking, the named stdio file descriptors and the environment files of the context are resolved, so that
 * failures there are reported to the caller instead of from within the child.
 *
 * Returns 0 and stores the PID of the new child in *ret on success. Returns a negative errno-style value if the
 * number of passed sockets does not fit a socket-connected stdio setup (-EINVAL), if the named file descriptors or
 * environment files cannot be loaded, on memory exhaustion, or if fork() fails. The child side never returns to the
 * caller: it always terminates via _exit() once exec_child() comes back. */
int exec_spawn(Unit *unit,
               ExecCommand *command,
               const ExecContext *context,
               const ExecParameters *params,
               ExecRuntime *runtime,
               DynamicCreds *dcreds,
               pid_t *ret) {

        _cleanup_strv_free_ char **files_env = NULL;
        _cleanup_free_ char *cmdline = NULL;
        int named_iofds[3] = { -1, -1, -1 };
        size_t n_socket_fds = 0, n_storage_fds = 0;
        int *fds = NULL, socket_fd = -1, r;
        pid_t child;

        assert(unit);
        assert(command);
        assert(context);
        assert(ret);
        assert(params);
        assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));

        if (context->std_input == EXEC_INPUT_SOCKET ||
            context->std_output == EXEC_OUTPUT_SOCKET ||
            context->std_error == EXEC_OUTPUT_SOCKET) {

                /* stdio is connected to a socket: exactly one socket must have been passed in, and it is handed
                 * over as socket_fd instead of as part of the regular fd array. */

                if (params->n_socket_fds == 0) {
                        log_unit_error(unit, "Got no socket.");
                        return -EINVAL;
                }

                if (params->n_socket_fds > 1) {
                        log_unit_error(unit, "Got more than one socket.");
                        return -EINVAL;
                }

                socket_fd = params->fds[0];
        } else {
                /* Otherwise pass the whole fd array on unmodified. */
                fds = params->fds;
                n_socket_fds = params->n_socket_fds;
                n_storage_fds = params->n_storage_fds;
        }

        r = exec_context_named_iofds(context, params, named_iofds);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");

        r = exec_context_load_environment(unit, context, &files_env);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load environment files: %m");

        cmdline = exec_command_line(command->argv);
        if (!cmdline)
                return log_oom();

        log_struct(LOG_DEBUG,
                   LOG_UNIT_MESSAGE(unit, "About to execute: %s", cmdline),
                   "EXECUTABLE=%s", command->path,
                   LOG_UNIT_ID(unit),
                   LOG_UNIT_INVOCATION_ID(unit));

        child = fork();
        if (child < 0)
                return log_unit_error_errno(unit, errno, "Failed to fork: %m");

        if (child == 0) {
                /* Child: exec_child() only comes back if the command was not executed. It fills in exit_status,
                 * which becomes our exit code. */
                int exit_status = EXIT_SUCCESS;

                r = exec_child(unit,
                               command,
                               context,
                               params,
                               runtime,
                               dcreds,
                               socket_fd,
                               named_iofds,
                               fds,
                               n_socket_fds,
                               n_storage_fds,
                               files_env,
                               unit->manager->user_lookup_fds[1],
                               &exit_status);

                if (r < 0)
                        log_struct_errno(LOG_ERR, r,
                                         "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
                                         LOG_UNIT_ID(unit),
                                         LOG_UNIT_INVOCATION_ID(unit),
                                         LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
                                                          exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
                                                          command->path),
                                         "EXECUTABLE=%s", command->path);

                _exit(exit_status);
        }

        /* Parent from here on. */
        log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, child);

        /* The new process is added to the cgroup both in the child (so that no user code is ever executed outside
         * of the cgroup) and here in the parent (so that killing the cgroup is certain to kill the process too).
         * Failure to attach is deliberately ignored. */
        if (params->cgroup_path)
                (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, child);

        exec_status_start(&command->exec_status, child);

        *ret = child;
        return 0;
}

/* Sets the fields of an execution context whose default is assigned explicitly here. No other field of *c is
 * touched by this function. */
void exec_context_init(ExecContext *c) {
        ExecDirectoryType dt;

        assert(c);

        /* Scheduling and resource related defaults */
        c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
        c->cpu_sched_policy = SCHED_OTHER;
        c->timer_slack_nsec = NSEC_INFINITY;

        /* Logging related defaults */
        c->syslog_priority = LOG_DAEMON|LOG_INFO;
        c->syslog_level_prefix = true;
        c->log_level_max = -1;

        /* Process environment defaults */
        c->umask = 0022;
        c->ignore_sigpipe = true;
        c->personality = PERSONALITY_INVALID;
        c->capability_bounding_set = CAP_ALL;

        /* The initial namespace flags value must be distinguishable from "all namespaces" */
        assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
        c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;

        /* Every directory type starts out with the same access mode */
        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
                c->directories[dt].mode = 0755;
}

/* Releases everything the execution context owns and resets the corresponding pointers and counters. The ExecContext
 * structure itself is not freed. */
void exec_context_done(ExecContext *c) {
        ExecDirectoryType dt;
        size_t fd;

        assert(c);

        /* Environment configuration */
        c->environment = strv_free(c->environment);
        c->environment_files = strv_free(c->environment_files);
        c->pass_environment = strv_free(c->pass_environment);
        c->unset_environment = strv_free(c->unset_environment);

        rlimit_free_all(c->rlimit);

        /* Per-stdio-stream settings (stdin, stdout, stderr) */
        for (fd = 0; fd < 3; fd++) {
                c->stdio_fdname[fd] = mfree(c->stdio_fdname[fd]);
                c->stdio_file[fd] = mfree(c->stdio_file[fd]);
        }

        /* Plain strings */
        c->working_directory = mfree(c->working_directory);
        c->root_directory = mfree(c->root_directory);
        c->root_image = mfree(c->root_image);
        c->tty_path = mfree(c->tty_path);
        c->syslog_identifier = mfree(c->syslog_identifier);
        c->user = mfree(c->user);
        c->group = mfree(c->group);

        c->supplementary_groups = strv_free(c->supplementary_groups);

        c->pam_name = mfree(c->pam_name);

        /* Path lists */
        c->read_only_paths = strv_free(c->read_only_paths);
        c->read_write_paths = strv_free(c->read_write_paths);
        c->inaccessible_paths = strv_free(c->inaccessible_paths);

        /* Arrays that come with an element count: free the array, then reset both pointer and count */
        bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
        c->bind_mounts = NULL;
        c->n_bind_mounts = 0;

        temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
        c->temporary_filesystems = NULL;
        c->n_temporary_filesystems = 0;

        c->cpuset = cpu_set_mfree(c->cpuset);

        /* Identifiers and security labels */
        c->utmp_id = mfree(c->utmp_id);
        c->selinux_context = mfree(c->selinux_context);
        c->apparmor_profile = mfree(c->apparmor_profile);
        c->smack_process_label = mfree(c->smack_process_label);

        /* System call filtering state */
        c->syscall_filter = hashmap_free(c->syscall_filter);
        c->syscall_archs = set_free(c->syscall_archs);
        c->address_families = set_free(c->address_families);

        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
                c->directories[dt].paths = strv_free(c->directories[dt].paths);

        /* Back to the same value exec_context_init() assigns */
        c->log_level_max = -1;

        exec_context_free_log_extra_fields(c);

        c->stdin_data = mfree(c->stdin_data);
        c->stdin_data_size = 0;
}

/* Removes every runtime directory configured in the context, each taken relative to 'runtime_prefix'.
 *
 * Returns 0 on success, and also when no prefix is given, in which case nothing is done. Returns -ENOMEM if a path
 * cannot be allocated; directories handled before that point stay removed. Failures of the removal itself are
 * ignored. */
int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
        char **dir;

        assert(c);

        if (!runtime_prefix)
                return 0;

        STRV_FOREACH(dir, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
                _cleanup_free_ char *full = NULL;

                full = strjoin(runtime_prefix, "/", *dir);
                if (!full)
                        return -ENOMEM;

                /* Done synchronously, since the directory must be gone by the time the service is started
                 * next. */
                (void) rm_rf(full, REMOVE_ROOT);
        }

        return 0;
}

/* Frees the path and argument vector of a single command and resets both fields. The ExecCommand object itself is
 * left allocated. */
static void exec_command_done(ExecCommand *c) {
        assert(c);

        c->argv = strv_free(c->argv);
        c->path = mfree(c->path);
}

/* Calls exec_command_done() on each of the first 'n' elements of the command array 'c'. */
void exec_command_done_array(ExecCommand *c, size_t n) {
        size_t k = 0;

        while (k < n) {
                exec_command_done(&c[k]);
                k++;
        }
}

/* Frees a whole linked list of commands, entry by entry. Always returns NULL, so that callers can assign the result
 * back to the list head they passed in. */
ExecCommand* exec_command_free_list(ExecCommand *c) {

        for (;;) {
                ExecCommand *head = c;

                if (!head)
                        break;

                /* Unlink the current head first (this advances 'c'), then release it */
                LIST_REMOVE(command, c, head);
                exec_command_done(head);
                free(head);
        }

        return NULL;
}

/* Frees each of the 'n' command lists stored in the array 'c' and resets every slot to the NULL that
 * exec_command_free_list() returns. */
void exec_command_free_array(ExecCommand **c, size_t n) {
        size_t k = 0;

        while (k < n) {
                c[k] = exec_command_free_list(c[k]);
                k++;
        }
}

/* Resets the execution status of each of the first 'n' commands of the array 'c'. */
void exec_command_reset_status_array(ExecCommand *c, size_t n) {
        size_t k;

        for (k = 0; k < n; k++) {
                ExecCommand *cmd = c + k;

                exec_status_reset(&cmd->exec_status);
        }
}

/* Resets the execution status of every command in each of the 'n' command lists stored in the array 'c'. */
void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
        size_t k = 0;

        while (k < n) {
                ExecCommand *cmd;

                LIST_FOREACH(command, cmd, c[k])
                        exec_status_reset(&cmd->exec_status);

                k++;
        }
}

/* Context passed as userdata to invalid_env(): names the unit and the environment file an invalid assignment was
 * found in, so that the log message can mention both. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* path of the environment file being processed */
} InvalidEnvInfo;

static void invalid_env(const char *p, void *userdata) {
        InvalidEnvInfo *info = userdata;

        log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
}

/* Returns the name of the file descriptor that shall be connected to the stdio stream 'fd_index' (STDIN_FILENO,
 * STDOUT_FILENO or STDERR_FILENO).
 *
 * If the stream is configured to use a named file descriptor, the configured name is returned, or the default
 * "stdin"/"stdout"/"stderr" when none is set. For streams not configured that way, and for any other index, NULL is
 * returned. The returned string is not a copy and must not be freed by the caller. */
const char* exec_context_fdname(const ExecContext *c, int fd_index) {
        assert(c);

        if (fd_index == STDIN_FILENO) {
                if (c->std_input == EXEC_INPUT_NAMED_FD)
                        return c->stdio_fdname[STDIN_FILENO] ?: "stdin";

                return NULL;
        }

        if (fd_index == STDOUT_FILENO) {
                if (c->std_output == EXEC_OUTPUT_NAMED_FD)
                        return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";

                return NULL;
        }

        if (fd_index == STDERR_FILENO) {
                if (c->std_error == EXEC_OUTPUT_NAMED_FD)
                        return c->stdio_fdname[STDERR_FILENO] ?: "stderr";

                return NULL;
        }

        return NULL;
}

/* Resolves the stdio streams that are configured to use a named file descriptor to actual fds from 'p'.
 *
 * Each passed fd is compared by name against the streams that are still unresolved, in the order stdin, stdout,
 * stderr; it is assigned to the first stream it matches, and only to that one. Slots of 'named_iofds' that already
 * hold a value >= 0 are left alone.
 *
 * Returns 0 if every stream that asks for a named fd was resolved, -ENOENT otherwise. */
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
        const char *stdio_fdname[3];
        bool wanted[3];
        size_t n_fds, remaining = 0, i;
        int slot;

        assert(c);
        assert(p);

        wanted[STDIN_FILENO] = c->std_input == EXEC_INPUT_NAMED_FD;
        wanted[STDOUT_FILENO] = c->std_output == EXEC_OUTPUT_NAMED_FD;
        wanted[STDERR_FILENO] = c->std_error == EXEC_OUTPUT_NAMED_FD;

        /* Look up the name for every stream, and count how many streams need to be resolved */
        for (slot = 0; slot < 3; slot++) {
                stdio_fdname[slot] = exec_context_fdname(c, slot);
                remaining += wanted[slot];
        }

        n_fds = p->n_storage_fds + p->n_socket_fds;

        for (i = 0; i < n_fds && remaining > 0; i++)
                for (slot = 0; slot < 3; slot++) {
                        /* Skip streams that are already resolved or do not ask for a named fd */
                        if (named_iofds[slot] >= 0 || !wanted[slot] || !stdio_fdname[slot])
                                continue;

                        if (!streq(p->fd_names[i], stdio_fdname[slot]))
                                continue;

                        named_iofds[slot] = p->fds[i];
                        remaining--;

                        /* One fd serves at most one stream */
                        break;
                }

        return remaining == 0 ? 0 : -ENOENT;
}

/* Loads all environment files configured in the context and merges their contents into a single string vector.
 *
 * Each configured entry is treated as a glob pattern, and every matching file is loaded in turn. An entry prefixed
 * with '-' is optional: if its path is not absolute, the glob fails or a matching file cannot be loaded, that is
 * skipped instead of being reported. Invalid assignments are dropped from the result and logged together with the
 * name of the file they came from.
 *
 * On success 0 is returned and the merged vector (possibly NULL if nothing was loaded) is stored in *l; the caller
 * takes ownership. On failure a negative errno-style value is returned, anything collected so far is freed and *l is
 * left untouched. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
        char **file, **env = NULL;

        assert(c);
        assert(l);

        STRV_FOREACH(file, c->environment_files) {
                _cleanup_globfree_ glob_t pglob = {};
                char *fn = *file;
                bool ignore;
                unsigned n;
                int k;

                /* A leading '-' marks the entry as optional */
                ignore = fn[0] == '-';
                if (ignore)
                        fn++;

                if (!path_is_absolute(fn)) {
                        if (ignore)
                                continue;

                        strv_free(env);
                        return -EINVAL;
                }

                /* Filename supports globbing, take all matching files */
                k = safe_glob(fn, 0, &pglob);
                if (k < 0) {
                        if (ignore)
                                continue;

                        strv_free(env);
                        return k;
                }

                /* When we don't match anything, -ENOENT should be returned */
                assert(pglob.gl_pathc > 0);

                for (n = 0; n < pglob.gl_pathc; n++) {
                        char **p, **merged;

                        k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
                        if (k < 0) {
                                if (ignore)
                                        continue;

                                strv_free(env);
                                return k;
                        }

                        /* Log invalid environment variables with filename */
                        if (p) {
                                InvalidEnvInfo info = {
                                        .unit = unit,
                                        .path = pglob.gl_pathv[n]
                                };

                                p = strv_env_clean_with_callback(p, invalid_env, &info);
                        }

                        /* The first vector loaded simply becomes the result */
                        if (!env) {
                                env = p;
                                continue;
                        }

                        /* Otherwise merge it into what we have so far; both inputs are released either way */
                        merged = strv_env_merge(2, env, p);
                        strv_free(env);
                        strv_free(p);
                        if (!merged)
                                return -ENOMEM;

                        env = merged;
                }
        }

        *l = env;

        return 0;
}

/* Checks whether the specified TTY might be the same device as the console. The answer errs on the side of "yes": it
 * is true if no TTY is given or if the console cannot be resolved. */
static bool tty_may_match_dev_console(const char *tty) {
        _cleanup_free_ char *resolved = NULL;
        const char *name;

        if (!tty)
                return true;

        name = skip_dev_prefix(tty);

        /* trivial identity? */
        if (streq(name, "console"))
                return true;

        /* if we could not resolve, assume it may */
        if (resolve_dev_console(&resolved) < 0)
                return true;

        if (streq(resolved, name))
                return true;

        /* "tty0" means the active VC, so it may be the same sometimes */
        return streq(resolved, "tty0") && tty_is_vc(name);
}

/* Returns true if the context uses a TTY in some way (TTY reset, vhangup or VT disallocation requested, or terminal
 * input/output configured for stdio) and that TTY may be the console. */
bool exec_context_may_touch_console(const ExecContext *ec) {
        bool uses_tty;

        uses_tty = ec->tty_reset ||
                   ec->tty_vhangup ||
                   ec->tty_vt_disallocate ||
                   is_terminal_input(ec->std_input) ||
                   is_terminal_output(ec->std_output) ||
                   is_terminal_output(ec->std_error);

        /* Only look at the TTY path if a TTY is used at all */
        if (!uses_tty)
                return false;

        return tty_may_match_dev_console(exec_context_tty_path(ec));
}

static void strv_fprintf(FILE *f, char **l) {
        char **g;

        assert(f);

        STRV_FOREACH(g, l)
                fprintf(f, " %s", *g);
}

void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
        ExecDirectoryType dt;
        char **e, **d;
        unsigned i;
        int r;

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sUMask: %04o\n"
                "%sWorkingDirectory: %s\n"
                "%sRootDirectory: %s\n"
                "%sNonBlocking: %s\n"
                "%sPrivateTmp: %s\n"
                "%sPrivateDevices: %s\n"
                "%sProtectKernelTunables: %s\n"
                "%sProtectKernelModules: %s\n"
                "%sProtectControlGroups: %s\n"
                "%sPrivateNetwork: %s\n"
                "%sPrivateUsers: %s\n"
                "%sProtectHome: %s\n"
                "%sProtectSystem: %s\n"
                "%sMountAPIVFS: %s\n"
                "%sIgnoreSIGPIPE: %s\n"
                "%sMemoryDenyWriteExecute: %s\n"
                "%sRestrictRealtime: %s\n"
                "%sKeyringMode: %s\n",
                prefix, c->umask,
                prefix, c->working_directory ? c->working_directory : "/",
                prefix, c->root_directory ? c->root_directory : "/",
                prefix, yes_no(c->non_blocking),
                prefix, yes_no(c->private_tmp),
                prefix, yes_no(c->private_devices),
                prefix, yes_no(c->protect_kernel_tunables),
                prefix, yes_no(c->protect_kernel_modules),
                prefix, yes_no(c->protect_control_groups),
                prefix, yes_no(c->private_network),
                prefix, yes_no(c->private_users),
                prefix, protect_home_to_string(c->protect_home),
                prefix, protect_system_to_string(c->protect_system),
                prefix, yes_no(c->mount_apivfs),
                prefix, yes_no(c->ignore_sigpipe),
                prefix, yes_no(c->memory_deny_write_execute),
                prefix, yes_no(c->restrict_realtime),
                prefix, exec_keyring_mode_to_string(c->keyring_mode));

        if (c->root_image)
                fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);

        STRV_FOREACH(e, c->environment)
                fprintf(f, "%sEnvironment: %s\n", prefix, *e);

        STRV_FOREACH(e, c->environment_files)
                fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);

        STRV_FOREACH(e, c->pass_environment)
                fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);

        STRV_FOREACH(e, c->unset_environment)
                fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);

        fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));

        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
                fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);

                STRV_FOREACH(d, c->directories[dt].paths)
                        fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
        }

        if (c->nice_set)
                fprintf(f,
                        "%sNice: %i\n",
                        prefix, c->nice);

        if (c->oom_score_adjust_set)
                fprintf(f,
                        "%sOOMScoreAdjust: %i\n",
                        prefix, c->oom_score_adjust);

        for (i = 0; i < RLIM_NLIMITS; i++)
                if (c->rlimit[i]) {
                        fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
                                prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
                        fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
                                prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
                }

        if (c->ioprio_set) {
                _cleanup_free_ char *class_str = NULL;

                r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
                if (r >= 0)
                        fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);

                fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
        }

        if (c->cpu_sched_set) {
                _cleanup_free_ char *policy_str = NULL;

                r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
                if (r >= 0)
                        fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);

                fprintf(f,
                        "%sCPUSchedulingPriority: %i\n"
                        "%sCPUSchedulingResetOnFork: %s\n",
                        prefix, c->cpu_sched_priority,
                        prefix, yes_no(c->cpu_sched_reset_on_fork));
        }

        if (c->cpuset) {
                fprintf(f, "%sCPUAffinity:", prefix);
                for (i = 0; i < c->cpuset_ncpus; i++)
                        if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
                                fprintf(f, " %u", i);
                fputs("\n", f);
        }

        if (c->timer_slack_nsec != NSEC_INFINITY)
                fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);

        fprintf(f,
                "%sStandardInput: %s\n"
                "%sStandardOutput: %s\n"
                "%sStandardError: %s\n",
                prefix, exec_input_to_string(c->std_input),
                prefix, exec_output_to_string(c->std_output),
                prefix, exec_output_to_string(c->std_error));

        if (c->std_input == EXEC_INPUT_NAMED_FD)
                fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
        if (c->std_output == EXEC_OUTPUT_NAMED_FD)
                fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
        if (c->std_error == EXEC_OUTPUT_NAMED_FD)
                fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);

        if (c->std_input == EXEC_INPUT_FILE)
                fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
        if (c->std_output == EXEC_OUTPUT_FILE)
                fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
        if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
                fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
        if (c->std_error == EXEC_OUTPUT_FILE)
                fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
        if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
                fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);

        if (c->tty_path)
                fprintf(f,
                        "%sTTYPath: %s\n"
                        "%sTTYReset: %s\n"
                        "%sTTYVHangup: %s\n"
                        "%sTTYVTDisallocate: %s\n",
                        prefix, c->tty_path,
                        prefix, yes_no(c->tty_reset),
                        prefix, yes_no(c->tty_vhangup),
                        prefix, yes_no(c->tty_vt_disallocate));

        if (IN_SET(c->std_output,
                   EXEC_OUTPUT_SYSLOG,
                   EXEC_OUTPUT_KMSG,
                   EXEC_OUTPUT_JOURNAL,
                   EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
                   EXEC_OUTPUT_KMSG_AND_CONSOLE,
                   EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
            IN_SET(c->std_error,
                   EXEC_OUTPUT_SYSLOG,
                   EXEC_OUTPUT_KMSG,
                   EXEC_OUTPUT_JOURNAL,
                   EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
                   EXEC_OUTPUT_KMSG_AND_CONSOLE,
                   EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {

                _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;

                r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
                if (r >= 0)
                        fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);

                r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
                if (r >= 0)
                        fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
        }

        if (c->log_level_max >= 0) {
                _cleanup_free_ char *t = NULL;

                (void) log_level_to_string_alloc(c->log_level_max, &t);

                fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
        }

        if (c->n_log_extra_fields > 0) {
                size_t j;

                for (j = 0; j < c->n_log_extra_fields; j++) {
                        fprintf(f, "%sLogExtraFields: ", prefix);
                        fwrite(c->log_extra_fields[j].iov_base,
                               1, c->log_extra_fields[j].iov_len,
                               f);
                        fputc('\n', f);
                }
        }

        if (c->secure_bits) {
                _cleanup_free_ char *str = NULL;

                r = secure_bits_to_string_alloc(c->secure_bits, &str);
                if (r >= 0)
                        fprintf(f, "%sSecure Bits: %s\n", prefix, str);
        }

        if (c->capability_bounding_set != CAP_ALL) {
                _cleanup_free_ char *str = NULL;

                r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
                if (r >= 0)
                        fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
        }

        if (c->capability_ambient_set != 0) {
                _cleanup_free_ char *str = NULL;

                r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
                if (r >= 0)
                        fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
        }

        if (c->user)
                fprintf(f, "%sUser: %s\n", prefix, c->user);
        if (c->group)
                fprintf(f, "%sGroup: %s\n", prefix, c->group);

        fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));

        if (!strv_isempty(c->supplementary_groups)) {
                fprintf(f, "%sSupplementaryGroups:", prefix);
                strv_fprintf(f, c->supplementary_groups);
                fputs("\n", f);
        }

        if (c->pam_name)
                fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);

        if (!strv_isempty(c->read_write_paths)) {
                fprintf(f, "%sReadWritePaths:", prefix);
                strv_fprintf(f, c->read_write_paths);
                fputs("\n", f);
        }

        if (!strv_isempty(c->read_only_paths)) {
                fprintf(f, "%sReadOnlyPaths:", prefix);
                strv_fprintf(f, c->read_only_paths);
                fputs("\n", f);
        }

        if (!strv_isempty(c->inaccessible_paths)) {
                fprintf(f, "%sInaccessiblePaths:", prefix);
                strv_fprintf(f, c->inaccessible_paths);
                fputs("\n", f);
        }

        if (c->n_bind_mounts > 0)
                for (i = 0; i < c->n_bind_mounts; i++)
                        fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
                                c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
                                c->bind_mounts[i].ignore_enoent ? "-": "",
                                c->bind_mounts[i].source,
                                c->bind_mounts[i].destination,
                                c->bind_mounts[i].recursive ? "rbind" : "norbind");

        if (c->n_temporary_filesystems > 0)
                for (i = 0; i < c->n_temporary_filesystems; i++) {
                        TemporaryFileSystem *t = c->temporary_filesystems + i;

                        fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
                                t->path,
                                isempty(t->options) ? "" : ":",
                                strempty(t->options));
                }

        if (c->utmp_id)
                fprintf(f,
                        "%sUtmpIdentifier: %s\n",
                        prefix, c->utmp_id);

        if (c->selinux_context)
                fprintf(f,
                        "%sSELinuxContext: %s%s\n",
                        prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);

        if (c->apparmor_profile)
                fprintf(f,
                        "%sAppArmorProfile: %s%s\n",
                        prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);

        if (c->smack_process_label)
                fprintf(f,
                        "%sSmackProcessLabel: %s%s\n",
                        prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);

        if (c->personality != PERSONALITY_INVALID)
                fprintf(f,
                        "%sPersonality: %s\n",
                        prefix, strna(personality_to_string(c->personality)));

        fprintf(f,
                "%sLockPersonality: %s\n",
                prefix, yes_no(c->lock_personality));

        if (c->syscall_filter) {
#if HAVE_SECCOMP
                Iterator j;
                void *id, *val;
                bool first = true;
#endif

                fprintf(f,
                        "%sSystemCallFilter: ",
                        prefix);

                if (!c->syscall_whitelist)
                        fputc('~', f);

#if HAVE_SECCOMP
                HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
                        _cleanup_free_ char *name = NULL;
                        const char *errno_name = NULL;
                        int num = PTR_TO_INT(val);

                        if (first)
                                first = false;
                        else
                                fputc(' ', f);

                        name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
                        fputs(strna(name), f);

                        if (num >= 0) {
                                errno_name = errno_to_name(num);
                                if (errno_name)
                                        fprintf(f, ":%s", errno_name);
                                else
                                        fprintf(f, ":%d", num);
                        }
                }
#endif

                fputc('\n', f);
        }

        if (c->syscall_archs) {
#if HAVE_SECCOMP
                Iterator j;
                void *id;
#endif

                fprintf(f,
                        "%sSystemCallArchitectures:",
                        prefix);

#if HAVE_SECCOMP
                SET_FOREACH(id, c->syscall_archs, j)
                        fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
#endif
                fputc('\n', f);
        }

        if (exec_context_restrict_namespaces_set(c)) {
                _cleanup_free_ char *s = NULL;

                r = namespace_flags_to_string(c->restrict_namespaces, &s);
                if (r >= 0)
                        fprintf(f, "%sRestrictNamespaces: %s\n",
                                prefix, s);
        }

        if (c->syscall_errno > 0) {
                const char *errno_name;

                fprintf(f, "%sSystemCallErrorNumber: ", prefix);

                errno_name = errno_to_name(c->syscall_errno);
                if (errno_name)
                        fprintf(f, "%s\n", errno_name);
                else
                        fprintf(f, "%d\n", c->syscall_errno);
        }

        if (c->apparmor_profile)
                fprintf(f,
                        "%sAppArmorProfile: %s%s\n",
                        prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
}

bool exec_context_maintains_privileges(const ExecContext *c) {
        assert(c);

        /* Tells whether the process forked off keeps its privileges, i.e. whether it runs under an
         * unchanged UID (no user configured at all) or as root (user given either as "root" or as
         * the numeric string "0"). */

        return !c->user ||
                streq(c->user, "root") ||
                streq(c->user, "0");
}

int exec_context_get_effective_ioprio(const ExecContext *c) {
        int current;

        assert(c);

        /* An explicitly configured I/O priority always takes precedence. */
        if (c->ioprio_set)
                return c->ioprio;

        /* Nothing configured: report what ioprio_get() returns for IOPRIO_WHO_PROCESS with ID 0,
         * and if even that cannot be determined fall back to the best-effort class at level 4. */
        current = ioprio_get(IOPRIO_WHO_PROCESS, 0);

        return current < 0 ? IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4) : current;
}

void exec_context_free_log_extra_fields(ExecContext *c) {
        size_t idx = 0;

        assert(c);

        /* Drops all extra log fields: first the payload of every entry, then the array holding the
         * entries. The counter is reset too, so that the context is left in a consistent, empty state. */

        while (idx < c->n_log_extra_fields)
                free(c->log_extra_fields[idx++].iov_base);

        c->n_log_extra_fields = 0;
        c->log_extra_fields = mfree(c->log_extra_fields);
}

void exec_status_start(ExecStatus *s, pid_t pid) {
        assert(s);

        /* Begin a fresh record: everything but the PID is zeroed out, then the start time is taken. */
        *s = (ExecStatus) { .pid = pid };
        dual_timestamp_get(&s->start_timestamp);
}

void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
        assert(s);

        /* Records the exit of process 'pid'. If the status object currently describes a different
         * process, it is wiped first, so that no stale start timestamp gets attributed to this PID. */
        if (s->pid != pid)
                *s = (ExecStatus) { .pid = pid };

        s->code = code;
        s->status = status;
        dual_timestamp_get(&s->exit_timestamp);

        /* The remaining steps need the execution context, which is optional. */
        if (!context)
                return;

        if (context->utmp_id)
                (void) utmp_put_dead_process(context->utmp_id, pid, code, status);

        exec_context_tty_reset(context, NULL);
}

void exec_status_reset(ExecStatus *s) {
        const ExecStatus pristine = {};

        assert(s);

        /* Bring the record back to its initial state, with every field zeroed. */
        *s = pristine;
}

void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
        char buf[FORMAT_TIMESTAMP_MAX];

        /* Writes a human-readable summary of the status record 's' to 'f', each line preceded by
         * 'prefix' (NULL is treated like the empty string). Nothing is written for records that do
         * not carry a valid PID. The exit related lines are only shown once an exit timestamp has
         * been recorded. */

        assert(s);
        assert(f);

        if (s->pid <= 0)
                return;

        prefix = strempty(prefix);

        fprintf(f,
                "%sPID: "PID_FMT"\n",
                prefix, s->pid);

        /* The strings below come from helpers that hand out pointers; route them through strna()
         * (as done for other lookups in this file) so that a NULL result can never reach "%s". */

        if (dual_timestamp_is_set(&s->start_timestamp))
                fprintf(f,
                        "%sStart Timestamp: %s\n",
                        prefix, strna(format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime)));

        if (dual_timestamp_is_set(&s->exit_timestamp))
                fprintf(f,
                        "%sExit Timestamp: %s\n"
                        "%sExit Code: %s\n"
                        "%sExit Status: %i\n",
                        prefix, strna(format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime)),
                        prefix, strna(sigchld_code_to_string(s->code)),
                        prefix, s->status);
}

static char *exec_command_line(char **argv) {
        size_t k;
        char *n, *p, **a;
        bool first = true;

        assert(argv);

        k = 1;
        STRV_FOREACH(a, argv)
                k += strlen(*a)+3;

        n = new(char, k);
        if (!n)
                return NULL;

        p = n;
        STRV_FOREACH(a, argv) {

                if (!first)
                        *(p++) = ' ';
                else
                        first = false;

                if (strpbrk(*a, WHITESPACE)) {
                        *(p++) = '\'';
                        p = stpcpy(p, *a);
                        *(p++) = '\'';
                } else
                        p = stpcpy(p, *a);

        }

        *p = 0;

        /* FIXME: this doesn't really handle arguments that have
         * spaces and ticks in them */

        return n;
}

static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
        _cleanup_free_ char *line = NULL;
        const char *status_prefix;

        /* Dumps a single command: its command line first, followed by its execution status, which is
         * indented by one additional tab. */

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        /* If the command line could not be put together, show the out-of-memory error text instead. */
        line = exec_command_line(c->argv);
        fprintf(f,
                "%sCommand Line: %s\n",
                prefix, line ? line : strerror(ENOMEM));

        status_prefix = strjoina(prefix, "\t");
        exec_status_dump(&c->exec_status, f, status_prefix);
}

void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
        ExecCommand *i;

        /* Dumps every command of the list starting at 'c' (which may be NULL, i.e. empty), in list order. */

        assert(f);

        prefix = strempty(prefix);

        LIST_FOREACH(command, i, c)
                exec_command_dump(i, f, prefix);
}

void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
        ExecCommand *tail;

        /* Appends 'e' at the very end of the list '*l'. */

        assert(l);
        assert(e);

        /* An empty list simply starts with the new entry. */
        if (!*l) {
                *l = e;
                return;
        }

        /* The order of the commands matters, hence look up the tail and insert right behind it. */
        LIST_FIND_TAIL(command, *l, tail);
        LIST_INSERT_AFTER(command, *l, tail, e);
}

int exec_command_set(ExecCommand *c, const char *path, ...) {
        char **argv_new, *path_new;
        va_list ap;

        /* Replaces both the path and the argument vector of the command. The new vector is built by
         * strv_new_ap() from 'path' and the variadic arguments following it. Both pieces are
         * allocated before anything in 'c' is touched, so a failure leaves the command unmodified.
         * Returns -ENOMEM if memory is short. */

        assert(c);
        assert(path);

        va_start(ap, path);
        argv_new = strv_new_ap(path, ap);
        va_end(ap);

        if (!argv_new)
                return -ENOMEM;

        path_new = strdup(path);
        if (!path_new) {
                strv_free(argv_new);
                return -ENOMEM;
        }

        free(c->path);
        c->path = path_new;

        return strv_free_and_replace(c->argv, argv_new);
}

int exec_command_append(ExecCommand *c, const char *path, ...) {
        _cleanup_strv_free_ char **extra = NULL;
        va_list ap;
        int r;

        /* Extends the argument vector of the command by 'path' and the variadic arguments following
         * it. The command's path itself is left alone. Returns 0 on success, negative errno on
         * failure. */

        assert(c);
        assert(path);

        va_start(ap, path);
        extra = strv_new_ap(path, ap);
        va_end(ap);

        if (!extra)
                return -ENOMEM;

        /* Any non-negative result of the extension is reported as plain 0. */
        r = strv_extend_strv(&c->argv, extra, false);

        return r < 0 ? r : 0;
}

static void *remove_tmpdir_thread(void *p) {
        _cleanup_free_ char *path = p;

        (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
        return NULL;
}

static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
        int r;

        /* Releases the object and everything it owns, and unregisters it from its manager, if it has
         * one. If 'destroy' is true the temporary directories are additionally removed from disk,
         * which is done asynchronously via asynchronous_job(). Always returns NULL. */

        if (!rt)
                return NULL;

        if (rt->manager)
                (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);

        if (destroy) {
                /* On success the job owns the path string (remove_tmpdir_thread() frees it), on
                 * failure it is still ours to free. Either way our pointer must be dropped. */

                if (rt->tmp_dir) {
                        log_debug("Spawning thread to nuke %s", rt->tmp_dir);

                        r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
                        if (r < 0) {
                                log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
                                free(rt->tmp_dir);
                        }

                        rt->tmp_dir = NULL;
                }

                if (rt->var_tmp_dir) {
                        log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);

                        r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
                        if (r < 0) {
                                log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
                                free(rt->var_tmp_dir);
                        }

                        rt->var_tmp_dir = NULL;
                }
        }

        rt->id = mfree(rt->id);
        rt->tmp_dir = mfree(rt->tmp_dir);
        rt->var_tmp_dir = mfree(rt->var_tmp_dir);
        safe_close_pair(rt->netns_storage_socket);

        return mfree(rt);
}

static void exec_runtime_freep(ExecRuntime **rt) {
        /* Cleanup handler for use with _cleanup_(): frees the object, but leaves its temporary
         * directories on disk alone. */

        if (!*rt)
                return;

        (void) exec_runtime_free(*rt, false);
}

static int exec_runtime_allocate(ExecRuntime **rt) {
        ExecRuntime *n;

        /* Allocates a zero-initialized ExecRuntime object and stores it in *rt. Returns -ENOMEM on
         * failure, in which case *rt is set to NULL. */

        assert(rt);

        n = new0(ExecRuntime, 1);
        *rt = n;
        if (!n)
                return -ENOMEM;

        /* Zero is a valid file descriptor, hence mark both sockets as unset explicitly. */
        n->netns_storage_socket[0] = n->netns_storage_socket[1] = -1;

        return 0;
}

static int exec_runtime_add(
                Manager *m,
                const char *id,
                const char *tmp_dir,
                const char *var_tmp_dir,
                const int netns_storage_socket[2],
                ExecRuntime **ret) {

        _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
        int r;

        /* Creates a new ExecRuntime object for 'id' and registers it in the manager's
         * exec_runtime_by_id hashmap.
         *
         * 'tmp_dir' and 'var_tmp_dir' are copied; if 'tmp_dir' is given, 'var_tmp_dir' must be given
         * too. 'netns_storage_socket' may be NULL; otherwise the two file descriptors are taken over
         * by the new object, but only if the call succeeds. On failure they are not touched and
         * remain the property of the caller.
         *
         * Returns 0 on success, in which case the new object is also returned in *ret, if 'ret' is
         * non-NULL. Returns a negative errno on failure. */

        assert(m);
        assert(id);

        r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
        if (r < 0)
                return r;

        r = exec_runtime_allocate(&rt);
        if (r < 0)
                return r;

        rt->id = strdup(id);
        if (!rt->id)
                return -ENOMEM;

        if (tmp_dir) {
                rt->tmp_dir = strdup(tmp_dir);
                if (!rt->tmp_dir)
                        return -ENOMEM;

                /* When tmp_dir is set, then we require var_tmp_dir is also set. */
                assert(var_tmp_dir);
                rt->var_tmp_dir = strdup(var_tmp_dir);
                if (!rt->var_tmp_dir)
                        return -ENOMEM;
        }

        r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
        if (r < 0)
                return r;

        /* Take over the sockets only now that nothing can fail anymore. If we stored them earlier,
         * a failure above would make our cleanup handler close them, while a caller such as
         * exec_runtime_make() still considers them its own and closes them a second time. */
        if (netns_storage_socket) {
                rt->netns_storage_socket[0] = netns_storage_socket[0];
                rt->netns_storage_socket[1] = netns_storage_socket[1];
        }

        rt->manager = m;

        if (ret)
                *ret = rt;

        /* do not remove created ExecRuntime object when the operation succeeds. */
        rt = NULL;
        return 0;
}

static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
        _cleanup_free_ char *tmp = NULL, *var_tmp = NULL;
        _cleanup_close_pair_ int sockets[2] = {-1, -1};
        int r;

        /* Sets up the runtime resources the context asks for (private temporary directories and/or
         * the socket pair of a private network) and registers them as a new ExecRuntime object
         * under 'id'.
         *
         * Returns 1 if an object was created (and handed out through 'ret' by exec_runtime_add()),
         * 0 if the context needs none (nothing is stored then), negative errno on failure. */

        assert(m);
        assert(c);
        assert(id);

        /* Neither feature requested? Then there is nothing to keep track of. */
        if (!(c->private_network || c->private_tmp))
                return 0;

        if (c->private_tmp) {
                r = setup_tmp_dirs(id, &tmp, &var_tmp);
                if (r < 0)
                        return r;
        }

        if (c->private_network &&
            socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, sockets) < 0)
                return -errno;

        r = exec_runtime_add(m, id, tmp, var_tmp, sockets, ret);
        if (r < 0)
                return r;

        /* The sockets belong to the new object now, hence keep our cleanup handler from closing them. */
        sockets[0] = sockets[1] = -1;

        return 1;
}

int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
        ExecRuntime *rt;
        int r;

        /* Looks up the ExecRuntime object registered under 'id' and takes a reference on it. If none
         * exists and 'create' is true, a new one is made from the context 'c'.
         *
         * Returns 1 if a referenced object was stored in *ret. Returns 0 if there is no object and
         * either 'create' is false or the context does not require one; *ret is left untouched
         * then. Returns negative errno on failure. */

        assert(m);
        assert(id);
        assert(ret);

        rt = hashmap_get(m->exec_runtime_by_id, id);
        if (!rt) {
                if (!create)
                        return 0;

                /* Not registered yet, try to create a new object. A result of zero means that no
                 * object is needed for this context. */
                r = exec_runtime_make(m, c, id, &rt);
                if (r <= 0)
                        return r;
        }

        /* Found or freshly created: the caller gets its own reference. */
        rt->n_ref++;
        *ret = rt;

        return 1;
}

ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
        /* Drops one reference. When the last one is gone the object is freed, and, if 'destroy' is
         * true, its temporary directories are removed as well. Always returns NULL. */

        if (!rt)
                return NULL;

        assert(rt->n_ref > 0);

        if (--rt->n_ref > 0)
                return NULL;

        return exec_runtime_free(rt, destroy);
}

int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
        ExecRuntime *rt;
        Iterator it;
        int dup_fd;

        /* Writes one "exec-runtime=" line per registered ExecRuntime object to 'f'. Each line holds
         * the ID followed by the optional fields tmp-dir=, var-tmp-dir=, netns-socket-0= and
         * netns-socket-1=, in this order and separated by single spaces. For the sockets the value
         * written is the fd number returned by fdset_put_dup() for 'fds'.
         *
         * Returns 0 on success, or the negative error of fdset_put_dup(). */

        assert(m);
        assert(f);
        assert(fds);

        HASHMAP_FOREACH(rt, m->exec_runtime_by_id, it) {
                fprintf(f, "exec-runtime=%s", rt->id);

                if (rt->tmp_dir)
                        fprintf(f, " tmp-dir=%s", rt->tmp_dir);

                if (rt->var_tmp_dir)
                        fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);

                if (rt->netns_storage_socket[0] >= 0) {
                        dup_fd = fdset_put_dup(fds, rt->netns_storage_socket[0]);
                        if (dup_fd < 0)
                                return dup_fd;

                        fprintf(f, " netns-socket-0=%i", dup_fd);
                }

                if (rt->netns_storage_socket[1] >= 0) {
                        dup_fd = fdset_put_dup(fds, rt->netns_storage_socket[1]);
                        if (dup_fd < 0)
                                return dup_fd;

                        fprintf(f, " netns-socket-1=%i", dup_fd);
                }

                fputc('\n', f);
        }

        return 0;
}

int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
        _cleanup_(exec_runtime_freep) ExecRuntime *rt_new = NULL;
        ExecRuntime *rt;
        int r;

        /* This is for the migration from old (v237 or earlier) deserialization text.
         * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
         * Even if the ExecRuntime object was originally created by another unit, we cannot tell so
         * from the serialized text, hence we always create a new object owned by this unit.
         *
         * Handles the keys "tmp-dir", "var-tmp-dir", "netns-socket-0" and "netns-socket-1". Returns 1
         * if the key/value pair was applied, 0 if it was ignored, negative errno on memory shortage. */

        assert(u);
        assert(key);
        assert(value);

        /* Manager manages ExecRuntime objects by the unit id.
         * So, we omit the serialized text when the unit does not have id (yet?)... */
        if (isempty(u->id)) {
                log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
                return 0;
        }

        r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
                return 0;
        }

        /* Reuse the object registered for this unit, or prepare a new one. A new object is only
         * registered at the very end, so that it is dropped again if the key turns out to be unusable. */
        rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
        if (!rt) {
                r = exec_runtime_allocate(&rt_new);
                if (r < 0)
                        return log_oom();

                rt_new->id = strdup(u->id);
                if (!rt_new->id)
                        return log_oom();

                rt = rt_new;
        }

        if (streq(key, "tmp-dir") || streq(key, "var-tmp-dir")) {
                char **field, *copy;

                field = streq(key, "tmp-dir") ? &rt->tmp_dir : &rt->var_tmp_dir;

                copy = strdup(value);
                if (!copy)
                        return log_oom();

                free_and_replace(*field, copy);

        } else if (streq(key, "netns-socket-0") || streq(key, "netns-socket-1")) {
                int fd, idx;

                idx = streq(key, "netns-socket-0") ? 0 : 1;

                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
                        return 0;
                }

                /* Replace whatever socket was stored before by the one taken out of the set. */
                safe_close(rt->netns_storage_socket[idx]);
                rt->netns_storage_socket[idx] = fdset_remove(fds, fd);

        } else
                return 0;

        /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
        if (rt_new) {
                r = hashmap_put(u->manager->exec_runtime_by_id, rt_new->id, rt_new);
                if (r < 0) {
                        log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
                        return 0;
                }

                rt_new->manager = u->manager;

                /* Avoid cleanup */
                rt_new = NULL;
        }

        return 1;
}

void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
        char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
        int r, fd0 = -1, fd1 = -1;
        const char *p, *v = value;
        size_t n;

        /* Parses one serialized ExecRuntime description and registers the resulting object with the
         * manager. The expected format is the one written by exec_runtime_serialize() after the
         * "exec-runtime=" key:
         *
         *     ID [tmp-dir=PATH] [var-tmp-dir=PATH] [netns-socket-0=FD] [netns-socket-1=FD]
         *
         * i.e. the fields are optional, but must show up in exactly this order, separated by single
         * spaces. The socket file descriptors referenced are taken out of 'fds'. Problems are logged
         * at debug level only, the entry is dropped in that case. */

        assert(m);
        assert(value);
        assert(fds);

        /* The temporary copies below live on the stack (strndupa()), hence there is nothing to free. */

        n = strcspn(v, " ");
        id = strndupa(v, n);
        if (v[n] != ' ')
                goto finalize;
        p = v + n + 1;

        v = startswith(p, "tmp-dir=");
        if (v) {
                n = strcspn(v, " ");
                tmp_dir = strndupa(v, n);
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "var-tmp-dir=");
        if (v) {
                n = strcspn(v, " ");
                var_tmp_dir = strndupa(v, n);
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "netns-socket-0=");
        if (v) {
                char *buf;

                n = strcspn(v, " ");
                buf = strndupa(v, n);
                if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
                        log_debug("Unable to process exec-runtime netns fd specification.");
                        return;
                }
                fd0 = fdset_remove(fds, fd0);
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "netns-socket-1=");
        if (v) {
                char *buf;

                n = strcspn(v, " ");
                buf = strndupa(v, n);
                if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
                        log_debug("Unable to process exec-runtime netns fd specification.");

                        /* The first socket may have been taken out of the set above already. Nobody
                         * else knows about it anymore, hence close it instead of leaking it. (This is
                         * a NOP if no first socket was specified, as fd0 is still -1 then.) */
                        safe_close(fd0);
                        return;
                }
                fd1 = fdset_remove(fds, fd1);
        }

finalize:

        /* NOTE(review): fd0/fd1 are not closed here if exec_runtime_add() fails; whether they are
         * released on that path depends on exec_runtime_add() itself. Confirm. */
        r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
        if (r < 0) {
                log_debug_errno(r, "Failed to add exec-runtime: %m");
                return;
        }
}

void exec_runtime_vacuum(Manager *m) {
        ExecRuntime *rt;
        Iterator it;

        assert(m);

        /* Gets rid of all ExecRuntime objects nobody holds a reference on. This is used after the
         * manager deserialization process. The temporary directories of the objects are left on disk.
         *
         * NOTE(review): exec_runtime_free() removes the object from the very hashmap iterated here;
         * this relies on the hashmap tolerating removal of the current entry during iteration. */

        HASHMAP_FOREACH(rt, m->exec_runtime_by_id, it)
                if (!(rt->n_ref > 0))
                        (void) exec_runtime_free(rt, false);
}

/* String names of the ExecInput values, indexed by the enum constant. The macro invoked below is
 * defined elsewhere; presumably it generates the to-string/from-string helpers for this table. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);

/* String names of the ExecOutput values, indexed by the enum constant. Note that the combined
 * "…+console" variants have entries of their own. The macro invoked below is defined elsewhere;
 * presumably it generates the to-string/from-string helpers for this table. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_SYSLOG] = "syslog",
        [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);

/* String names of the ExecUtmpMode values, indexed by the enum constant. The macro invoked below is
 * defined elsewhere; presumably it generates the to-string/from-string helpers for this table. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);

/* String names of the ExecPreserveMode values, indexed by the enum constant. The lookup is defined
 * through the _WITH_BOOLEAN variant of the macro, with EXEC_PRESERVE_YES as extra argument.
 * NOTE(review): presumably this makes boolean spellings parse as well, with "true" values mapping
 * to EXEC_PRESERVE_YES; confirm against the macro definition. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);

/* Names of the ExecDirectoryType values in their "…Directory" spelling, indexed by the enum
 * constant. The macro invoked below is defined elsewhere; presumably it generates the
 * to-string/from-string helpers for this table. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);

/* A second set of names for the ExecDirectoryType values, in upper-case "…_DIRECTORY" spelling
 * (going by the table name, these are environment variable names). Only a to-string lookup is
 * defined for this table, through the _PRIVATE_ variant of the macro.
 * NOTE(review): presumably the generated helper is local to this file; confirm against the macro
 * definition. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);

/* String names of the ExecKeyringMode values, indexed by the enum constant. The macro invoked below
 * is defined elsewhere; presumably it generates the to-string/from-string helpers for this table. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);
-												Add SPDX license identifiers to source files under the LGPL

This follows what the kernel is doing, c.f.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5fd54ace4721fc5ce2bb5aef6318fcf17f421460.

											
										
										
											2017-11-18 17:09:20 +01:00
+								/* SPDX-License-Identifier: LGPL-2.1+ */
-												license: add GPLv2+ license blurbs everwhere

											
										
										
											2010-02-03 13:03:47 +01:00
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								#include <errno.h>
 								#include <fcntl.h>
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include <glob.h>
 								#include <grp.h>
 								#include <poll.h>
-												reset signal mask when forking

											
										
										
											2010-01-27 06:17:51 +01:00
+								#include <signal.h>
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include <string.h>
-												core: set NoNewPrivileges for seccomp if we don't have CAP_SYS_ADMIN

The manpage of seccomp specify that using seccomp with
SECCOMP_SET_MODE_FILTER will return EACCES if the caller do not have
CAP_SYS_ADMIN set, or if the no_new_privileges bit is not set. Hence,
without NoNewPrivilege set, it is impossible to use a SystemCall*
directive with a User directive set in system mode.

Now, NoNewPrivileges is set if we are in user mode, or if we are in
system mode and we don't have CAP_SYS_ADMIN, and SystemCall*
directives are used.

											
										
										
											2016-01-30 17:26:39 +01:00
+								#include <sys/capability.h>
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								#include <sys/eventfd.h>
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								#include <sys/mman.h>
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include <sys/personality.h>
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								#include <sys/prctl.h>
-												seccomp: also block shmat(..., SHM_EXEC) for MemoryDenyWriteExecute

shmat(..., SHM_EXEC) can be used to create writable and executable
memory, so let's block it when MemoryDenyWriteExecute is set.

											
										
										
											2016-10-26 17:52:53 +02:00
+								#include <sys/shm.h>
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include <sys/socket.h>
-												execute: allow configuration of O_NONBLOCK flag from .service files

											
										
										
											2010-02-12 02:00:18 +01:00
+								#include <sys/stat.h>
-												seccomp: also block shmat(..., SHM_EXEC) for MemoryDenyWriteExecute

shmat(..., SHM_EXEC) can be used to create writable and executable
memory, so let's block it when MemoryDenyWriteExecute is set.

											
										
										
											2016-10-26 17:52:53 +02:00
+								#include <sys/types.h>
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include <sys/un.h>
 								#include <unistd.h>
-												core: optionally create LOGIN_PROCESS or USER_PROCESS utmp entries

When generating utmp/wtmp entries, optionally add both LOGIN_PROCESS and
INIT_PROCESS entries or even all three of LOGIN_PROCESS, INIT_PROCESS
and USER_PROCESS entries, instead of just a single INIT_PROCESS entry.

With this change systemd may be used to not only invoke a getty directly
in a SysV-compliant way but alternatively also a login(1) implementation
or even forego getty and login entirely, and invoke arbitrary shells in
a way that they appear in who(1) or w(1).

This is preparation for a later commit that adds a "machinectl shell"
operation to invoke a shell in a container, in a way that is compatible
with who(1) and w(1).

											
										
										
											2015-08-23 13:14:04 +02:00
+								#include <utmpx.h>
-												first attempt at implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_PAM
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								#include <security/pam_appl.h>
 								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SELINUX
-												exec: Add SELinuxContext configuration item

This permits system administrators to decide the domain of a service.
This can be used with templated units to have each service in a different
domain (for example, a per-customer database, using MLS or anything),
or can be used to force a non-SELinux-enabled system (jvm, erlang, etc)
to start in a different domain for each service.

											
										
										
											2014-02-06 10:05:16 +01:00
+								#include <selinux/selinux.h>
 								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								#include <seccomp.h>
 								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_APPARMOR
-												core: Add AppArmor profile switching

This permits switching to a specific AppArmor profile when starting a daemon. This
will result in a no-op if AppArmor is disabled.
It also adds a new build requirement on libapparmor for using this feature.

											
										
										
											2014-02-20 16:19:44 +01:00
+								#include <sys/apparmor.h>
 								#endif
-												util: split out signal-util.[ch] from util.[ch]

No functional changes.

											
										
										
											2015-05-29 20:14:11 +02:00
+								#include "sd-messages.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
 								#include "af-list.h"
-												util-lib: split out allocation calls into alloc-util.[ch]

											
										
										
											2015-10-27 03:01:06 +01:00
+								#include "alloc-util.h"
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_APPARMOR
-												util-lib: split out fd-related operations into fd-util.[ch]

There are more than enough to deserve their own .c file, hence move them
over.

											
										
										
											2015-10-25 13:14:12 +01:00
+								#include "apparmor-util.h"
 								#endif
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "async.h"
 								#include "barrier.h"
 								#include "cap-list.h"
-												src/basic: rename audit.[ch] → audit-util.[ch] and capability.[ch] → capability-util.[ch]

The files are named too generically, so that they might conflict with
the upstream project headers. Hence, let's add a "-util" suffix, to
clarify that these are just our utility headers and not any official
upstream headers.

											
										
										
											2015-10-26 23:32:16 +01:00
+								#include "capability-util.h"
-												core: chown() StateDirectory= and friends recursively when starting a service

This is particularly useful when used in conjunction with DynamicUser=1,
where the UID might change for every invocation, but is useful in other
cases too, for example, when these directories are shared between
systems where the UID assignments differ slightly.

											
										
										
											2017-09-28 19:13:44 +02:00
+								#include "chown-recursive.h"
-												tree-wide: use cpu_set_mfree()

											
										
										
											2017-11-30 15:23:16 +01:00
+								#include "cpu-set-util.h"
-												def: centralize definition of default timeout in one place

											
										
										
											2011-03-17 04:02:35 +01:00
+								#include "def.h"
-												env: considerably beef up environment cleaning logic

Now, actually check if the environment variable names and values used
are valid, before accepting them. With this in place we are at some places
more rigid than POSIX, and less rigid at others. For example, this code
allows lower-case environment variables (which POSIX suggests not to
use), but it will not allow non-UTF8 variable values.

All in all this should be a good middle ground of what to allow and what
not to allow as environment variables.

(This also splits out all environment related calls into env-util.[ch])

											
										
										
											2013-02-11 03:46:08 +01:00
+								#include "env-util.h"
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								#include "errno-list.h"
-												util-lib: split out fd-related operations into fd-util.[ch]

There are more than enough to deserve their own .c file, hence move them
over.

											
										
										
											2015-10-25 13:14:12 +01:00
+								#include "execute.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "exit-status.h"
-												util-lib: split out fd-related operations into fd-util.[ch]

There are more than enough to deserve their own .c file, hence move them
over.

											
										
										
											2015-10-25 13:14:12 +01:00
+								#include "fd-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "fileio.h"
-												Rename formats-util.h to format-util.h

We don't have plural in the name of any other -util files and this
inconsistency trips me up every time I try to type this file name
from memory. "formats-util" is even hard to pronounce.

											
										
										
											2016-11-07 16:14:59 +01:00
+								#include "format-util.h"
-												util-lib: move a number of fs operations into fs-util.[ch]

											
										
										
											2015-10-26 21:16:26 +01:00
+								#include "fs-util.h"
-												util-lib: split out globbing related calls into glob-util.[ch]

											
										
										
											2015-10-27 01:48:17 +01:00
+								#include "glob-util.h"
-												util-lib: split out IO related calls to io-util.[ch]

											
										
										
											2015-10-25 14:08:25 +01:00
+								#include "io-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "ioprio.h"
-												core: chown() StateDirectory= and friends recursively when starting a service

This is particularly useful when used in conjunction with DynamicUser=1,
where the UID might change for every invocation, but is useful in other
cases too, for example, when these directories are shared between
systems where the UID assignments differ slightly.

											
										
										
											2017-09-28 19:13:44 +02:00
+								#include "label.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "log.h"
 								#include "macro.h"
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								#include "manager.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "missing.h"
 								#include "mkdir.h"
 								#include "namespace.h"
-												util-lib: split string parsing related calls from util.[ch] into parse-util.[ch]

											
										
										
											2015-10-26 16:18:16 +01:00
+								#include "parse-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "path-util.h"
-												shared: add process-util.[ch]

											
										
										
											2015-04-10 19:10:00 +02:00
+								#include "process-util.h"
-												util-lib: split out resource limits related calls into rlimit-util.[ch]

											
										
										
											2015-10-26 19:40:43 +01:00
+								#include "rlimit-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "rm-rf.h"
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												util-lib: split out fd-related operations into fd-util.[ch]

There are more than enough to deserve their own .c file, hence move them
over.

											
										
										
											2015-10-25 13:14:12 +01:00
+								#include "seccomp-util.h"
 								#endif
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "securebits.h"
-												securebits-util: add secure_bits_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:40:25 +02:00
+								#include "securebits-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "selinux-util.h"
-												util: split out signal-util.[ch] from util.[ch]

No functional changes.

											
										
										
											2015-05-29 20:14:11 +02:00
+								#include "signal-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "smack-util.h"
-												core: undo the dependency inversion between unit.h and all unit types

											
										
										
											2018-05-15 20:17:34 +02:00
+								#include "socket-util.h"
-												core: bypass dynamic user lookups from dbus-daemon

dbus-daemon does NSS name look-ups in order to enforce its bus policy. This
might dead-lock if an NSS module wants to use D-Bus for the look-up itself,
like our nss-systemd does. Let's work around this by bypassing bus
communication in the NSS module if we run inside of dbus-daemon. To make this
work we keep a bit of extra state in /run/systemd/dynamic-uid/ so that we don't
have to consult the bus, but can still resolve the names.

Note that the normal codepath continues to be via the bus, so that resolving
works from all mount namespaces and is subject to authentication, as before.

This is a bit dirty, but not too dirty, as dbus daemon is kinda special anyway
for PID 1.

											
										
										
											2016-08-02 12:28:51 +02:00
+								#include "special.h"
-												core: support upgrading from DynamicUser=0 to DynamicUser=1 for unit directories (#7507)

This makes sure we migrate /var/lib/<foo> if it exists to
/var/lib/private/<foo> if DynamicUser=1 is set. This is useful to allow
turning on DynamicUser= on services that previously didn't use it, and
we can deal with this, and migrate the relevant directories as
necessary.

Note that "downgrading" from DynamicUser=1 back to DynamicUser=0 works
too. However in that case we simply continue to use
/var/lib/private/<foo>, which works because /var/lib/<foo> is a symlink
there after all.
											
										
										
											2017-11-30 11:52:39 +01:00
+								#include "stat-util.h"
-												util-lib: move string table stuff into its own string-table.[ch]

											
										
										
											2015-10-26 22:31:05 +01:00
+								#include "string-table.h"
-												util-lib: split out string related calls from util.[ch] into its own file string-util.[ch]

There are more than enough calls doing string manipulations to deserve
its own files, hence do something about it.

This patch also sorts the #include blocks of all files that needed to be
updated, according to the sorting suggestions from CODING_STYLE. Since
pretty much every file needs our string manipulation functions this
effectively means that most files have sorted #include blocks now.

Also touches a few unrelated include files.

											
										
										
											2015-10-24 22:58:24 +02:00
+								#include "string-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "strv.h"
-												util-lib: split out syslog-related calls into syslog-util.[ch]

											
										
										
											2015-10-27 00:40:25 +01:00
+								#include "syslog-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "terminal-util.h"
-												Add support for opening files for appending

Addresses part of #8983

											
										
										
											2018-07-03 21:22:29 +02:00
+								#include "umask-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "unit.h"
-												util-lib: split out user/group/uid/gid calls into user-util.[ch]

											
										
										
											2015-10-25 22:32:30 +01:00
+								#include "user-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "util.h"
 								#include "utmp-wtmp.h"
-												first attempt at implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
-												service: for Type=idle units consider START_PRE, START, START_POST all as ACTIVE

We want to avoid a deadlock when a service has ExecStartPre= programs
that wait for the job queue to run empty because of Type=idle, but which
themselves keep the queue non-empty because START_PRE was considered
ACTIVATING and hence the job not complete. With this patch we alter the
state translation table so that it is impossible ever to wait for
a Type=idle unit, hence removing the deadlock.

											
										
										
											2012-05-24 02:22:35 +02:00
+								#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
+								#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
-												execute: use a much lower idle timeout than the default timeout

The idle timeout after all is for cosmetics only, hence avoid any
substantial delays just for it.

											
										
										
											2012-05-22 19:26:13 +02:00
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
+								/* This assumes there is a 'tty' group */
 								#define TTY_MODE 0620
-												execute: also set SO_SNDBUF when spawning a service with stdout/stderr connected to journald

											
										
										
											2013-12-16 20:00:09 +01:00
+								#define SNDBUF_SIZE (8*1024*1024)
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
/* Rearranges the passed file descriptors so that fds[i] ends up being fd
 * number i+3, i.e. the array occupies the contiguous range starting at 3.
 *
 * The array is modified in place: every entry that had to be moved is
 * replaced by its duplicate and the previous fd is closed.
 *
 * Returns 0 on success (also when n_fds is 0), or a negative errno-style
 * value if fcntl(F_DUPFD) fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        /* Keep sweeping until one full pass needed no retry. */
        while (first >= 0) {
                int retry_at = -1;
                int idx;

                for (idx = first; idx < (int) n_fds; idx++) {
                        const int wanted = idx + 3;
                        int dup_fd;

                        /* Nothing to do if this one already sits in its slot. */
                        if (fds[idx] == wanted)
                                continue;

                        /* F_DUPFD hands out the lowest free fd >= wanted. */
                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we asked for was still occupied (by a later
                         * entry that has yet to be moved). Remember the first
                         * such index and start the next pass from there. */
                        if (retry_at < 0 && dup_fd != wanted)
                                retry_at = idx;
                }

                first = retry_at;
        }

        return 0;
}
-												core: swap order of "n_storage_fds" and "n_socket_fds" parameters

When processing fd lists to pass to activated programs we always place the
socket activation fds first, and the storage fds last. Irritatingly in
almost all calls the "n_storage_fds" parameter (i.e. the number of
storage fds to pass) came first so far, and the "n_socket_fds" parameter
second. Let's clean this up, and specify the number of fds in the order
the fds themselves are passed.

(Also, let's fix one more case where "unsigned" was used to size an
array, while we should use "size_t" instead.)

											
										
										
											2018-07-05 09:56:54 +02:00
/* Prepares the fds that are about to be passed to a child process.
 *
 * The array holds n_socket_fds socket activation fds followed by
 * n_storage_fds fd store fds. For each fd, in array order:
 *
 *   - if it is one of the leading socket activation fds, O_NONBLOCK is
 *     set or cleared according to 'nonblock' (fd store fds are left alone),
 *   - FD_CLOEXEC is dropped, since the fd is meant to survive into the child.
 *
 * Returns 0 on success (also when there are no fds at all), or the negative
 * error of the first helper call that failed. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        const size_t total = n_socket_fds + n_storage_fds;
        size_t k;

        if (total == 0)
                return 0;

        assert(fds);

        for (k = 0; k < total; k++) {
                int r;

                /* O_NONBLOCK only applies to socket activation fds. */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* Always clear FD_CLOEXEC, we want our children to inherit these. */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fds in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								static const char *exec_context_tty_path(const ExecContext *context) {
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(context);
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        if (context->stdio_as_fds)
 								                return NULL;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        if (context->tty_path)
 								                return context->tty_path;
 								        return "/dev/console";
 								}
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fds in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 								        const char *path;
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								        assert(context);
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        path = exec_context_tty_path(context);
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        if (context->tty_vhangup) {
 								                if (p && p->stdin_fd >= 0)
 								                        (void) terminal_vhangup_fd(p->stdin_fd);
 								                else if (path)
 								                        (void) terminal_vhangup(path);
 								        }
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        if (context->tty_reset) {
 								                if (p && p->stdin_fd >= 0)
 								                        (void) reset_terminal_fd(p->stdin_fd, true);
 								                else if (path)
 								                        (void) reset_terminal(path);
 								        }
 								        if (context->tty_vt_disallocate && path)
 								                (void) vt_disallocate(path);
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								}
-												core: inherit TERM from PID 1 for all services started on /dev/console

This way, invoking nspawn from a shell in the best case inherits the TERM
setting all the way down into the login shell spawned in the container.

Fixes: #3697

											
										
										
											2016-07-27 15:25:55 +02:00
+								static bool is_terminal_input(ExecInput i) {
 								        return IN_SET(i,
 								                      EXEC_INPUT_TTY,
 								                      EXEC_INPUT_TTY_FORCE,
 								                      EXEC_INPUT_TTY_FAIL);
 								}
-												core/execute: add internal is_terminal_output()

Similar to already existing is_terminal_input().

Note that the only current user (connect_logger_as) is never called
for EXEC_OUTPUT_TTY, so it won't mind whether we accept it.

											
										
										
											2013-02-28 01:35:47 +01:00
+								static bool is_terminal_output(ExecOutput o) {
-												core: inherit TERM from PID 1 for all services started on /dev/console

This way, invoking nspawn from a shell in the best case inherits the TERM
setting all the way down into the login shell spawned in the container.

Fixes: #3697

											
										
										
											2016-07-27 15:25:55 +02:00
+								        return IN_SET(o,
 								                      EXEC_OUTPUT_TTY,
 								                      EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 								                      EXEC_OUTPUT_KMSG_AND_CONSOLE,
 								                      EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 								}
-												execute: minor ExecOutput handling beautification (#6711)

Let's clean up the checking for the various ExecOutput values a bit,
let's use IN_SET everywhere, and the same concepts for all three bools
we pass to dprintf().
											
										
										
											2017-09-01 02:04:27 +02:00
+								static bool is_syslog_output(ExecOutput o) {
 								        return IN_SET(o,
 								                      EXEC_OUTPUT_SYSLOG,
 								                      EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 								}
 								/* Returns true if the given output mode is one of the kmsg modes, with or
 								 * without the console (EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE). */
 								static bool is_kmsg_output(ExecOutput o) {
 								        return o == EXEC_OUTPUT_KMSG ||
 								               o == EXEC_OUTPUT_KMSG_AND_CONSOLE;
 								}
-												core: inherit TERM from PID 1 for all services started on /dev/console

This way, invoking nspawn from a shell in the best case inherits the TERM
setting all the way down into the login shell spawned in the container.

Fixes: #3697

											
										
										
											2016-07-27 15:25:55 +02:00
+								static bool exec_context_needs_term(const ExecContext *c) {
 								        assert(c);
 								        /* Return true if the execution context suggests we should set $TERM to something useful. */
 								        if (is_terminal_input(c->std_input))
 								                return true;
 								        if (is_terminal_output(c->std_output))
 								                return true;
 								        if (is_terminal_output(c->std_error))
 								                return true;
 								        return !!c->tty_path;
-												core/execute: add internal is_terminal_output()

Similar to already existing is_terminal_input().

Note that the only current user (connect_logger_as) is never called
for EXEC_OUTPUT_TTY, so it won't mind whether we accept it.

											
										
										
											2013-02-28 01:35:47 +01:00
+								}
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
/* Opens /dev/null with the given open(2) flags (plus O_NOCTTY) and hands the
 * resulting fd to move_fd() so that it ends up as fd number 'nfd'.
 *
 * Returns -errno if /dev/null cannot be opened, otherwise whatever
 * move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags | O_NOCTTY);
        if (null_fd >= 0)
                return move_fd(null_fd, nfd, false);

        return -errno;
}
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
-												execute: make some code shorter

Let's simplify some lines to make it shorter.

											
										
										
											2017-07-14 18:58:57 +02:00
+								        static const union sockaddr_union sa = {
-												Use initalization instead of explicit zeroing

Before, we would initialize many fields twice: first
by filling the structure with zeros, and then a second
time with the real values. We can let the compiler do
the job for us, avoiding one copy.

A downside of this patch is that text gets slightly
bigger. This is because all zero() calls are effectively
inlined:

$ size build/.libs/systemd
         text    data     bss     dec     hex filename
before 897737  107300    2560 1007597   f5fed build/.libs/systemd
after  897873  107300    2560 1007733   f6075 build/.libs/systemd

… actually less than 1‰.

A few asserts that the parameter is not null had to be removed. I
don't think this changes much, because first, it is quite unlikely
for the assert to fail, and second, an immediate SEGV is almost as
good as an assert.

											
										
										
											2013-03-25 00:59:00 +01:00
+								                .un.sun_family = AF_UNIX,
 								                .un.sun_path = "/run/systemd/journal/stdout",
 								        };
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								        uid_t olduid = UID_INVALID;
 								        gid_t oldgid = GID_INVALID;
 								        int r;
-												core, sd-bus, logind: make use of uid_is_valid() in more places

											
										
										
											2017-07-14 18:57:04 +02:00
+								        if (gid_is_valid(gid)) {
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                oldgid = getgid();
-												execute: make some code shorter

Let's simplify some lines to make it shorter.

											
										
										
											2017-07-14 18:58:57 +02:00
+								                if (setegid(gid) < 0)
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                        return -errno;
 								        }
-												core, sd-bus, logind: make use of uid_is_valid() in more places

											
										
										
											2017-07-14 18:57:04 +02:00
+								        if (uid_is_valid(uid)) {
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                olduid = getuid();
-												execute: make some code shorter

Let's simplify some lines to make it shorter.

											
										
										
											2017-07-14 18:58:57 +02:00
+								                if (seteuid(uid) < 0) {
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                        r = -errno;
 								                        goto restore_gid;
 								                }
 								        }
-												execute: make some code shorter

Let's simplify some lines to make it shorter.

											
										
										
											2017-07-14 18:58:57 +02:00
+								        r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
 								        /* If we fail to restore the uid or gid, things will likely
 								           fail later on. This should only happen if an LSM interferes. */
-												core, sd-bus, logind: make use of uid_is_valid() in more places

											
										
										
											2017-07-14 18:57:04 +02:00
+								        if (uid_is_valid(uid))
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                (void) seteuid(olduid);
 								 restore_gid:
-												core, sd-bus, logind: make use of uid_is_valid() in more places

											
										
										
											2017-07-14 18:57:04 +02:00
+								        if (gid_is_valid(gid))
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                (void) setegid(oldgid);
 								        return r;
 								}
-												execute: minor coding style improvements

											
										
										
											2016-06-14 16:50:35 +02:00
+								static int connect_logger_as(
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								                const Unit *unit,
-												execute: minor coding style improvements

											
										
										
											2016-06-14 16:50:35 +02:00
+								                const ExecContext *context,
-												execute: let's decouple execute.c a bit from the unit logic

Let's try to decouple the execution engine a bit from the Unit/Manager
concept, and hence pass one more flag as part of the ExecParameters flags
field.

											
										
										
											2017-08-01 10:28:20 +02:00
+								                const ExecParameters *params,
-												execute: minor coding style improvements

											
										
										
											2016-06-14 16:50:35 +02:00
+								                ExecOutput output,
 								                const char *ident,
 								                int nfd,
 								                uid_t uid,
 								                gid_t gid) {
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								        int fd, r;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
 								        assert(context);
-												execute: let's decouple execute.c a bit from the unit logic

Let's try to decouple the execution engine a bit from the Unit/Manager
concept, and hence pass one more flag as part of the ExecParameters flags
field.

											
										
										
											2017-08-01 10:28:20 +02:00
+								        assert(params);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(output < _EXEC_OUTPUT_MAX);
 								        assert(ident);
 								        assert(nfd >= 0);
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: talk directly to the journald, instead to the stdout-syslog-bridge

											
										
										
											2012-01-05 21:39:08 +01:00
+								        fd = socket(AF_UNIX, SOCK_STREAM, 0);
 								        if (fd < 0)
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                return -errno;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								        r = connect_journal_socket(fd, uid, gid);
 								        if (r < 0)
 								                return r;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        if (shutdown(fd, SHUT_RD) < 0) {
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop a > 200 lines of code that
was required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								                safe_close(fd);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                return -errno;
 								        }
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: minor coding style improvements

											
										
										
											2016-06-14 16:50:35 +02:00
+								        (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
-												execute: also set SO_SNDBUF when spawning a service with stdout/stderr connected to journald

											
										
										
											2013-12-16 20:00:09 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        dprintf(fd,
-												journal: set the _SYSTEMD_UNIT field for messages from terminated processes

As described in

  https://bugs.freedesktop.org/show_bug.cgi?id=50184

the journal currently doesn't set fields such as _SYSTEMD_UNIT
properly for messages coming from processes that have already
terminated.  This means among other things that "systemctl status" may
not show some of the output of services that wrote messages just
before they exited.

This patch fixes this by having processes that log to the journal
write their unit identifier to journald when the connection to
/run/systemd/journal/stdout is opened.  Journald stores the unit ID
and uses it to fill in _SYSTEMD_UNIT when it cannot be obtained
normally (i.e. from the cgroup).  To prevent impersonating another
unit, this information is only used when the caller is root.

This doesn't fix the general problem of getting metadata about
messages from terminated processes (which requires some kernel
support), but it allows "systemctl status" and similar queries to do
the Right Thing for units that log via stdout/stderr.

											
										
										
											2012-06-21 22:40:47 +02:00
+								                "%s\n"
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                "%s\n"
 								                "%i\n"
-												execute: talk directly to the journald, instead to the stdout-syslog-bridge

											
										
										
											2012-01-05 21:39:08 +01:00
+								                "%i\n"
 								                "%i\n"
 								                "%i\n"
-												logger: support printk() style priority prefixes

											
										
										
											2010-05-16 01:46:35 +02:00
+								                "%i\n",
-												execute: don't pass unit ID in --user mode to journald for stream logging

When we create a log stream connection to journald, we pass along the
unit ID. With this change we do this only when we run as system
instance, not as user instance, to remove the ambiguity whether a user
or system unit is specified. The effect of this change is minor:
journald ignores the field anyway from clients with UID != 0. This patch
hence only fixes the unit attribution for the --user instance of the
root user.

											
										
										
											2017-07-14 18:59:41 +02:00
+								                context->syslog_identifier ?: ident,
-												execute: let's decouple execute.c a bit from the unit logic

Let's try to decouple the execution engine a bit from the Unit/Manager
concept, and hence pass one more flag as part of the ExecParameters flags
field.

											
										
										
											2017-08-01 10:28:20 +02:00
+								                params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
-												execute: talk directly to the journald, instead to the stdout-syslog-bridge

											
										
										
											2012-01-05 21:39:08 +01:00
+								                context->syslog_priority,
 								                !!context->syslog_level_prefix,
-												execute: minor ExecOutput handling beautification (#6711)

Let's clean up the checking for the various ExecOutput values a bit,
let's use IN_SET everywhere, and the same concepts for all three bools
we pass to dprintf().
											
										
										
											2017-09-01 02:04:27 +02:00
+								                is_syslog_output(output),
 								                is_kmsg_output(output),
-												core/execute: add internal is_terminal_output()

Similar to already existing is_terminal_input().

Note that the only current user (connect_logger_as) is never called
for EXEC_OUTPUT_TTY, so it won't mind whether we accept it.

											
										
										
											2013-02-28 01:35:47 +01:00
+								                is_terminal_output(output));
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												fd-util: add new helper move_fd() and make use of it

We are using the same pattern at various places: call dup2() on an fd,
and close the old fd, usually in combination with some O_CLOEXEC
fiddling. Let's add a little helper for this, and port a few obvious
cases over.

											
										
										
											2017-10-26 18:45:54 +02:00
+								        return move_fd(fd, nfd, false);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								}
-												execute: fix type of open_terminal_as() flags parameter

It's the flags parameter we propagate here, not the mode parameter,
hence let's name it properly, and use the right type.

											
										
										
											2017-10-27 14:32:22 +02:00
+								static int open_terminal_as(const char *path, int flags, int nfd) {
-												fd-util: add new helper move_fd() and make use of it

We are using the same pattern at various places: call dup2() on an fd,
and close the old fd, usually in combination with some O_CLOEXEC
fiddling. Let's add a little helper for this, and port a few obvious
cases over.

											
										
										
											2017-10-26 18:45:54 +02:00
+								        int fd;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(path);
 								        assert(nfd >= 0);
-												execute: minor coding style improvements

											
										
										
											2016-06-14 16:50:35 +02:00
-												execute: fix type of open_terminal_as() flags parameter

It's the flags parameter we propagate here, not the mode parameter,
hence let's name it properly, and use the right type.

											
										
										
											2017-10-27 14:32:22 +02:00
+								        fd = open_terminal(path, flags | O_NOCTTY);
-												tree-wide: don't do assignments within if checks

Turn this:

       if ((r = foo()) < 0) { ...

into this:

       r = foo();
       if (r < 0) { ...

											
										
										
											2015-09-08 19:14:10 +02:00
+								        if (fd < 0)
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                return fd;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												fd-util: add new helper move_fd() and make use of it

We are using the same pattern at various places: call dup2() on an fd,
and close the old fd, usually in combination with some O_CLOEXEC
fiddling. Let's add a little helper for this, and port a few obvious
cases over.

											
										
										
											2017-10-26 18:45:54 +02:00
+								        return move_fd(fd, nfd, false);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								}
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												core: add support for StandardInputFile= and friends

These new settings permit specifying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) If the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate the fd instead.

2) If an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								static int acquire_path(const char *path, int flags, mode_t mode) {
 								        union sockaddr_union sa = {
 								                .sa.sa_family = AF_UNIX,
 								        };
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        int fd, r;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(path);
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												core: add support for StandardInputFile= and friends

These new settings permit specifiying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate them fd instead.

2) If we an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								        if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 								                flags |= O_CREAT;
 								        fd = open(path, flags|O_NOCTTY, mode);
 								        if (fd >= 0)
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                return fd;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												core: add support for StandardInputFile= and friends

These new settings permit specifiying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate them fd instead.

2) If we an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								        if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 								                return -errno;
 								        if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 								                return -ENXIO;
 								        /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 								        fd = socket(AF_UNIX, SOCK_STREAM, 0);
 								        if (fd < 0)
 								                return -errno;
 								        strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
 								        if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop a > 200 lines of code that
was required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								                safe_close(fd);
-												core: add support for StandardInputFile= and friends

These new settings permit specifiying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate them fd instead.

2) If we an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								                return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 								                                                           * indication that his wasn't an AF_UNIX socket after all */
 								        }
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												core: add support for StandardInputFile= and friends

These new settings permit specifiying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate them fd instead.

2) If we an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								        if ((flags & O_ACCMODE) == O_RDONLY)
 								                r = shutdown(fd, SHUT_WR);
 								        else if ((flags & O_ACCMODE) == O_WRONLY)
 								                r = shutdown(fd, SHUT_RD);
 								        else
 								                return fd;
 								        if (r < 0) {
 								                safe_close(fd);
 								                return -errno;
 								        }
 								        return fd;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								}
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												core: add two new unit file settings: StandardInputData= + StandardInputText=

Both permit configuring data to pass through STDIN to an invoked
process. StandardInputText= accepts a line of text (possibly with
embedded C-style escapes as well as unit specifiers), which is appended
to the buffer to pass as stdin, followed by a single newline.
StandardInputData= is similar, but accepts arbitrary base64 encoded
data, and will not resolve specifiers or C-style escapes, nor append
newlines.

This may be used to pass input/configuration data to services, directly
in-line from unit files, either in a cooked or in a more raw format.

											
										
										
											2017-10-27 11:33:05 +02:00
+								static int fixup_input(
 								                const ExecContext *context,
 								                int socket_fd,
 								                bool apply_tty_stdin) {
 								        ExecInput std_input;
 								        assert(context);
 								        std_input = context->std_input;
-												execute: if the main process of a service already owns the TTY, don't wait for acquiring it again in the reload/stop step

											
										
										
											2010-07-08 04:09:17 +02:00
 								        if (is_terminal_input(std_input) && !apply_tty_stdin)
 								                return EXEC_INPUT_NULL;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								        if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								                return EXEC_INPUT_NULL;
-												core: add two new unit file settings: StandardInputData= + StandardInputText=

Both permit configuring data to pass through STDIN to an invoked
process. StandardInputText= accepts a line of text (possibly with
embedded C-style escapes as well as unit specifiers), which is appended
to the buffer to pass as stdin, followed by a single newline.
StandardInputData= is similar, but accepts arbitrary base64 encoded
data, and will not resolve specifiers or C-style escapes, nor append
newlines.

This may be used to pass input/configuration data to services, directly
in-line from unit files, either in a cooked or in a more raw format.

											
										
										
											2017-10-27 11:33:05 +02:00
+								        if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 								                return EXEC_INPUT_NULL;
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								        return std_input;
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								}
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								static int fixup_output(ExecOutput std_output, int socket_fd) {
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								        if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								                return EXEC_OUTPUT_INHERIT;
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								        return std_output;
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								}
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								static int setup_input(
 								                const ExecContext *context,
 								                const ExecParameters *params,
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                int socket_fd,
 								                int named_iofds[3]) {
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								        ExecInput i;
 								        assert(context);
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        assert(params);
 								        if (params->stdin_fd >= 0) {
 								                if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 								                        return -errno;
 								                /* Try to make this the controlling tty, if it is a tty, and reset it */
-												execute: check whether we are actually on a TTY before doing TIOCSCTTY

Given that Linux assigns the same ioctl numbers to multiple subsystems,
we should be careful when invoking ioctls, so that we don't end up
calling something we wouldn't want to call.

											
										
										
											2017-10-26 18:47:34 +02:00
+								                if (isatty(STDIN_FILENO)) {
 								                        (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 								                        (void) reset_terminal_fd(STDIN_FILENO, true);
 								                }
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
 								                return STDIN_FILENO;
 								        }
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												core: add two new unit file settings: StandardInputData= + StandardInputText=

Both permit configuring data to pass through STDIN to an invoked
process. StandardInputText= accepts a line of text (possibly with
embedded C-style escapes as well as unit specifiers), which is appended
to the buffer to pass as stdin, followed by a single newline.
StandardInputData= is similar, but accepts arbitrary base64 encoded
data, and will not resolve specifiers or C-style escapes, nor append
newlines.

This may be used to pass input/configuration data to services, directly
in-line from unit files, either in a cooked or in a more raw format.

											
										
										
											2017-10-27 11:33:05 +02:00
+								        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
 								        switch (i) {
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        case EXEC_INPUT_NULL:
 								                return open_null_as(O_RDONLY, STDIN_FILENO);
 								        case EXEC_INPUT_TTY:
 								        case EXEC_INPUT_TTY_FORCE:
 								        case EXEC_INPUT_TTY_FAIL: {
-												fd-util: add new helper move_fd() and make use of it

We are using the same pattern at various places: call dup2() on an fd,
and close the old fd, usually in combination with some O_CLOEXEC
fiddling. Let's add a little helper for this, and port a few obvious
cases over.

											
										
										
											2017-10-26 18:45:54 +02:00
+								                int fd;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                fd = acquire_terminal(exec_context_tty_path(context),
-												terminal-util: rework acquire_terminal()

This modernizes acquire_terminal() in a couple of ways:

1. The three boolean arguments are replaced by a flags parameter, that
   should be more descriptive in what it does.

2. We now properly handle inotify queue overruns

3. We use _cleanup_ for closing the fds now, to shorten the code quite a
   bit.

Behaviour should not be altered by this.

											
										
										
											2018-02-13 21:24:37 +01:00
+								                                      i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 								                                      i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 								                                                                  ACQUIRE_TERMINAL_WAIT,
-												time-util: add and use USEC/NSEC_INFINIY

											
										
										
											2014-07-29 12:23:31 +02:00
+								                                      USEC_INFINITY);
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                if (fd < 0)
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                        return fd;
-												fd-util: add new helper move_fd() and make use of it

We are using the same pattern at various places: call dup2() on an fd,
and close the old fd, usually in combination with some O_CLOEXEC
fiddling. Let's add a little helper for this, and port a few obvious
cases over.

											
										
										
											2017-10-26 18:45:54 +02:00
+								                return move_fd(fd, STDIN_FILENO, false);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        }
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								        case EXEC_INPUT_SOCKET:
-												execute: some extra asserts

In some cases we checked for fd validity already explicitly, let's do
this for all our fds.

											
										
										
											2017-10-27 14:59:05 +02:00
+								                assert(socket_fd >= 0);
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								                return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        case EXEC_INPUT_NAMED_FD:
-												execute: some extra asserts

In some cases we checked for fd validity already explicitly, let's do
this for all our fds.

											
										
										
											2017-10-27 14:59:05 +02:00
+								                assert(named_iofds[STDIN_FILENO] >= 0);
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 								                return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
-												core: add two new unit file settings: StandardInputData= + StandardInputText=

Both permit configuring data to pass through STDIN to an invoked
process. StandardInputText= accepts a line of text (possibly with
embedded C-style escapes as well as unit specifiers), which is appended
to the buffer to pass as stdin, followed by a single newline.
StandardInputData= is similar, but accepts arbitrary base64 encoded
data, and will not resolve specifiers or C-style escapes, nor append
newlines.

This may be used to pass input/configuration data to services, directly
in-line from unit files, either in a cooked or in a more raw format.

											
										
										
											2017-10-27 11:33:05 +02:00
+								        case EXEC_INPUT_DATA: {
 								                int fd;
 								                fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 								                if (fd < 0)
 								                        return fd;
 								                return move_fd(fd, STDIN_FILENO, false);
 								        }
-												core: add support for StandardInputFile= and friends

These new settings permit specifying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate the fd instead.

2) If an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								        case EXEC_INPUT_FILE: {
 								                bool rw;
 								                int fd;
 								                assert(context->stdio_file[STDIN_FILENO]);
 								                rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 								                        (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 								                fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 								                if (fd < 0)
 								                        return fd;
 								                return move_fd(fd, STDIN_FILENO, false);
 								        }
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        default:
 								                assert_not_reached("Unknown input type");
 								        }
 								}
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								static int setup_output(
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								                const Unit *unit,
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								                const ExecContext *context,
 								                const ExecParameters *params,
 								                int fileno,
 								                int socket_fd,
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                int named_iofds[3],
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								                const char *ident,
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								                uid_t uid,
 								                gid_t gid,
 								                dev_t *journal_stream_dev,
 								                ino_t *journal_stream_ino) {
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								        ExecOutput o;
 								        ExecInput i;
-												execute: robustness against journald failures

Almost every unit logs to the journal. If journald gets a permanent
failure, units would not be able to start (exit code 209/STDOUT).

Add a fallback to /dev/null to avoid making the system entirely
unusable in such a case.

											
										
										
											2013-02-15 22:43:23 +01:00
+								        int r;
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        assert(unit);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(context);
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        assert(params);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(ident);
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								        assert(journal_stream_dev);
 								        assert(journal_stream_ino);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 								                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 								                        return -errno;
 								                return STDOUT_FILENO;
 								        }
 								        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 								                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 								                        return -errno;
 								                return STDERR_FILENO;
 								        }
-												core: add two new unit file settings: StandardInputData= + StandardInputText=

Both permit configuring data to pass through STDIN to an invoked
process. StandardInputText= accepts a line of text (possibly with
embedded C-style escapes as well as unit specifiers), which is appended
to the buffer to pass as stdin, followed by a single newline.
StandardInputData= is similar, but accepts arbitrary base64 encoded
data, and will not resolve specifiers or C-style escapes, nor append
newlines.

This may be used to pass input/configuration data to services, directly
in-line from unit files, either in a cooked or in a more raw format.

											
										
										
											2017-10-27 11:33:05 +02:00
+								        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								        o = fixup_output(context->std_output, socket_fd);
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								        if (fileno == STDERR_FILENO) {
 								                ExecOutput e;
 								                e = fixup_output(context->std_error, socket_fd);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                /* This expects the input and output are already set up */
 								                /* Don't change the stderr file descriptor if we inherit all
 								                 * the way and are not on a tty */
 								                if (e == EXEC_OUTPUT_INHERIT &&
 								                    o == EXEC_OUTPUT_INHERIT &&
 								                    i == EXEC_INPUT_NULL &&
 								                    !is_terminal_input(context->std_input) &&
 								                    getppid () != 1)
 								                        return fileno;
 								                /* Duplicate from stdout if possible */
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                o = e;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								        } else if (o == EXEC_OUTPUT_INHERIT) {
-												execute: inherit from original input, not the fixed up

											
										
										
											2010-07-12 22:04:59 +02:00
+								                /* If input got downgraded, inherit the original value */
 								                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
-												execute: inherit from original input, not the fixed up

											
										
										
											2010-07-12 22:04:59 +02:00
-												core: add two new unit file settings: StandardInputData= + StandardInputText=

Both permit configuring data to pass through STDIN to an invoked
process. StandardInputText= accepts a line of text (possibly with
embedded C-style escapes as well as unit specifiers), which is appended
to the buffer to pass as stdin, followed by a single newline.
StandardInputData= is similar, but accepts arbitrary base64 encoded
data, and will not resolve specifiers or C-style escapes, nor append
newlines.

This may be used to pass input/configuration data to services, directly
in-line from unit files, either in a cooked or in a more raw format.

											
										
										
											2017-10-27 11:33:05 +02:00
+								                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 								                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: change stdout inherit logic, when run as PID 1 use /dev/null

											
										
										
											2010-07-07 04:37:42 +02:00
+								                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 								                if (getppid() != 1)
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                        return fileno;
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                /* We need to open /dev/null here anew, to get the right access mode. */
 								                return open_null_as(O_WRONLY, fileno);
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								        switch (o) {
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        case EXEC_OUTPUT_NULL:
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                return open_null_as(O_WRONLY, fileno);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        case EXEC_OUTPUT_TTY:
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								                if (is_terminal_input(i))
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								                /* We don't reset the terminal if this is just about output */
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        case EXEC_OUTPUT_SYSLOG:
-												execute: optionally forward program output to /dev/console in addition to syslog/kmsg

											
										
										
											2011-02-15 01:27:53 +01:00
+								        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
-												execute: s/EXEC_OUTPUT_KERNEL/EXEC_OUTPUT_KMSG/ to follow LOG_TARGET_xxx nomenclature

											
										
										
											2010-05-19 21:49:03 +02:00
+								        case EXEC_OUTPUT_KMSG:
-												execute: optionally forward program output to /dev/console in addition to syslog/kmsg

											
										
										
											2011-02-15 01:27:53 +01:00
+								        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
-												journal: introduce log target 'journal' for executed processes

											
										
										
											2012-01-05 23:54:45 +01:00
+								        case EXEC_OUTPUT_JOURNAL:
 								        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
-												execute: let's decouple execute.c a bit from the unit logic

Let's try to decouple the execution engine a bit from the Unit/Manager
concept, and hence pass one more flag as part of the ExecParameters flags
field.

											
										
										
											2017-08-01 10:28:20 +02:00
+								                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
-												execute: robustness against journald failures

Almost every unit logs to the journal. If journald gets a permanent
failure, units would not be able to start (exit code 209/STDOUT).

Add a fallback to /dev/null to avoid making the system entirely
unusable in such a case.

											
										
										
											2013-02-15 22:43:23 +01:00
+								                if (r < 0) {
-												execute: downgrade a log message ERR → WARNING, since we proceed ignoring its result

											
										
										
											2017-09-26 17:42:17 +02:00
+								                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                        r = open_null_as(O_WRONLY, fileno);
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								                } else {
 								                        struct stat st;
 								                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 								                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
-												core: make sure that $JOURNAL_STREAM prefers stderr over stdout information (#6824)

If two separate log streams are connected to stdout and stderr, let's
make sure $JOURNAL_STREAM points to the latter, as that's the preferred
log destination, and the environment variable has been created in order
to permit services to automatically upgrade from stderr based logging to
native journal logging.

Also, document this behaviour.

Fixes: #6800
											
										
										
											2017-09-15 08:26:38 +02:00
+								                         * services to detect whether they are connected to the journal or not.
 								                         *
 								                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
 								                         * about STDERR as that's usually the best way to do logging. */
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
-												core: make sure that $JOURNAL_STREAM prefers stderr over stdout information (#6824)

If two separate log streams are connected to stdout and stderr, let's
make sure $JOURNAL_STREAM points to the latter, as that's the preferred
log destination, and the environment variable has been created in order
to permit services to automatically upgrade from stderr based logging to
native journal logging.

Also, document this behaviour.

Fixes: #6800
											
										
										
											2017-09-15 08:26:38 +02:00
+								                        if (fstat(fileno, &st) >= 0 &&
 								                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								                                *journal_stream_dev = st.st_dev;
 								                                *journal_stream_ino = st.st_ino;
 								                        }
-												execute: robustness against journald failures

Almost every unit logs to the journal. If journald gets a permanent
failure, units would not be able to start (exit code 209/STDOUT).

Add a fallback to /dev/null to avoid making the system entirely
unusable in such a case.

											
										
										
											2013-02-15 22:43:23 +01:00
+								                }
 								                return r;
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
 								        case EXEC_OUTPUT_SOCKET:
 								                assert(socket_fd >= 0);
-												execute: some extra asserts

In some cases we checked for fd validity already explicitly, let's do
this for all our fds.

											
										
										
											2017-10-27 14:59:05 +02:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        case EXEC_OUTPUT_NAMED_FD:
-												execute: some extra asserts

In some cases we checked for fd validity already explicitly, let's do
this for all our fds.

											
										
										
											2017-10-27 14:59:05 +02:00
+								                assert(named_iofds[fileno] >= 0);
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                (void) fd_nonblock(named_iofds[fileno], false);
 								                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
-												Add support for opening files for appending

Addresses part of #8983

											
										
										
											2018-07-03 21:22:29 +02:00
+								        case EXEC_OUTPUT_FILE:
 								        case EXEC_OUTPUT_FILE_APPEND: {
-												core: add support for StandardInputFile= and friends

These new settings permit specifying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate the fd instead.

2) If an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								                bool rw;
-												Add support for opening files for appending

Addresses part of #8983

											
										
										
											2018-07-03 21:22:29 +02:00
+								                int fd, flags;
-												core: add support for StandardInputFile= and friends

These new settings permit specifying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate the fd instead.

2) If an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
 								                assert(context->stdio_file[fileno]);
 								                rw = context->std_input == EXEC_INPUT_FILE &&
 								                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 								                if (rw)
 								                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
-												Add support for opening files for appending

Addresses part of #8983

											
										
										
											2018-07-03 21:22:29 +02:00
+								                flags = O_WRONLY;
 								                if (o == EXEC_OUTPUT_FILE_APPEND)
 								                        flags |= O_APPEND;
 								                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
-												core: add support for StandardInputFile= and friends

These new settings permit specifying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate the fd instead.

2) If an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								                if (fd < 0)
 								                        return fd;
-												Add support for opening files for appending

Addresses part of #8983

											
										
										
											2018-07-03 21:22:29 +02:00
+								                return move_fd(fd, fileno, 0);
-												core: add support for StandardInputFile= and friends

These new settings permit specifying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate the fd instead.

2) If an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        default:
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                assert_not_reached("Unknown error type");
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        }
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
+								}
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
+								static int chown_terminal(int fd, uid_t uid) {
 								        struct stat st;
 								        assert(fd >= 0);
-												execute: check whether the specified fd is a tty before chowning/chmoding  it (#3457)

Let's add an extra safety check before we chmod/chown a TTY to the right user,
as we might end up having connected something to STDIN/STDOUT that is actually
not a TTY, even though this might have been requested, due to permissive
StandardInput= settings or transient service activation with fds passed in.

Fixes:

https://bugs.freedesktop.org/show_bug.cgi?id=85255
											
										
										
											2016-06-09 10:01:16 +02:00
+								        /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 								        if (isatty(fd) < 1)
 								                return 0;
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
+								        /* This might fail. What matters are the results. */
-												make gcc shut up

											
										
										
											2010-05-10 03:34:31 +02:00
+								        (void) fchown(fd, uid, -1);
 								        (void) fchmod(fd, TTY_MODE);
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
 								        if (fstat(fd, &st) < 0)
 								                return -errno;
-												execute: fix terminal chowning logic

											
										
										
											2010-04-13 21:13:49 +02:00
+								        if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
+								                return -EPERM;
 								        return 0;
 								}
-												core: allow to redirect confirmation messages to a different console

It's rather hard to parse the confirmation messages (enabled with
systemd.confirm_spawn=true) amongst the status messages and the kernel
ones (if enabled).

This patch gives the possibility to the user to redirect the confirmation
message to a different virtual console, either by giving its name or its path,
so those messages are separated from the other ones and easier to read.

											
										
										
											2016-11-02 10:38:22 +01:00
+								static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 								        int r;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        assert(_saved_stdin);
 								        assert(_saved_stdout);
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 								        if (saved_stdin < 0)
 								                return -errno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        if (saved_stdout < 0)
 								                return -errno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												terminal-util: rework acquire_terminal()

This modernizes acquire_terminal() in a couple of ways:

1. The three boolean arguments are replaced by a flags parameter, that
   should be more descriptive in what it does.

2. We now properly handle inotify queue overruns

3. We use _cleanup_ for closing the fds now, to shorten the code quite a
   bit.

Behaviour should not be altered by this.

											
										
										
											2018-02-13 21:24:37 +01:00
+								        fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        if (fd < 0)
 								                return fd;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        r = chown_terminal(fd, getuid());
 								        if (r < 0)
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								                return r;
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        r = reset_terminal_fd(fd, true);
 								        if (r < 0)
 								                return r;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												tree-wide: port various places over to use new rearrange_stdio()

											
										
										
											2018-02-28 23:32:49 +01:00
+								        r = rearrange_stdio(fd, fd, STDERR_FILENO);
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        fd = -1;
-												tree-wide: port various places over to use new rearrange_stdio()

											
										
										
											2018-02-28 23:32:49 +01:00
+								        if (r < 0)
 								                return r;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        *_saved_stdin = saved_stdin;
 								        *_saved_stdout = saved_stdout;
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        saved_stdin = saved_stdout = -1;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        return 0;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								}
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								static void write_confirm_error_fd(int err, int fd, const Unit *u) {
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        assert(err < 0);
 								        if (err == -ETIMEDOUT)
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								                dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        else {
 								                errno = -err;
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        }
 								}
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								static void write_confirm_error(int err, const char *vc, const Unit *u) {
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop > 200 lines of code that
was required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								        _cleanup_close_ int fd = -1;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        assert(vc);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: allow to redirect confirmation messages to a different console

It's rather hard to parse the confirmation messages (enabled with
systemd.confirm_spawn=true) amongst the status messages and the kernel
ones (if enabled).

This patch gives the possibility to the user to redirect the confirmation
message to a different virtual console, either by giving its name or its path,
so those messages are separated from the other ones and easier to read.

											
										
										
											2016-11-02 10:38:22 +01:00
+								        fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        if (fd < 0)
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                return;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								        write_confirm_error_fd(err, fd, u);
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								}
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        int r = 0;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        assert(saved_stdin);
 								        assert(saved_stdout);
 								        release_terminal();
 								        if (*saved_stdin >= 0)
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                if (dup2(*saved_stdin, STDIN_FILENO) < 0)
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								                        r = -errno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        if (*saved_stdout >= 0)
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								                        r = -errno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        *saved_stdin = safe_close(*saved_stdin);
 								        *saved_stdout = safe_close(*saved_stdout);
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
 								        return r;
 								}
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								enum {
 								        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command and pretend it failed */
 								        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command and pretend it succeeded */
 								        CONFIRM_EXECUTE = 1,          /* positive answer: execute the command */
 								};
-												core: add 'i' in confirm spawn to give a short summary of the unit to spawn

											
										
										
											2016-11-12 14:55:12 +01:00
+								static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        int saved_stdout = -1, saved_stdin = -1, r;
-												core: limit the length of the confirmation question

When "confirmation_spawn=1", the confirmation question can look like:

  Execute /usr/bin/kmod static-nodes --format=tmpfiles --output=/run/tmpfiles.d/kmod.conf? [Yes, No, Skip]

which is pretty verbose and might not fit in the console width size (which is
usually 80 chars) and thus the question will be split across 2 consecutive lines.

However since the question is now refreshed every 2 secs, the reprinted
question will overwrite the second line of the previous one...

To prevent this, this patch makes sure that the command line won't be longer
than 60 chars by ellipsizing it if the command is longer:

  Execute /usr/bin/kmod static-nodes --format=tmpfiles --output=/ru…nf? [Yes, No, View, Skip]

A following patch will introduce a new choice that will allow the user to get
details on the command to be executed so it will still be possible to see the
full command line.

											
										
										
											2016-11-07 17:14:59 +01:00
+								        _cleanup_free_ char *e = NULL;
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        char c;
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        /* For any internal errors, assume a positive response. */
-												core: allow to redirect confirmation messages to a different console

It's rather hard to parse the confirmation messages (enabled with
systemd.confirm_spawn=true) amongst the status messages and the kernel
ones (if enabled).

This patch gives the possibility to the user to redirect the confirmation
message to a different virtual console, either by giving its name or its path,
so those messages are separated from the other ones and easier to read.

											
										
										
											2016-11-02 10:38:22 +01:00
+								        r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        if (r < 0) {
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								                write_confirm_error(r, vc, u);
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                return CONFIRM_EXECUTE;
 								        }
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: add 'c' in confirmation_spawn to resume the boot process

											
										
										
											2016-11-15 09:29:04 +01:00
+								        /* confirm_spawn might have been disabled while we were sleeping. */
 								        if (manager_is_confirm_spawn_disabled(u->manager)) {
 								                r = 1;
 								                goto restore_stdio;
 								        }
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: limit the length of the confirmation question

When "confirmation_spawn=1", the confirmation question can look like:

  Execute /usr/bin/kmod static-nodes --format=tmpfiles --output=/run/tmpfiles.d/kmod.conf? [Yes, No, Skip]

which is pretty verbose and might not fit in the console width size (which is
usually 80 chars) and thus the question will be split across 2 consecutive lines.

However since the question is now refreshed every 2 secs, the reprinted
question will overwrite the second line of the previous one...

To prevent this, this patch makes sure that the command line won't be longer
than 60 chars by ellipsizing it if the command is longer:

  Execute /usr/bin/kmod static-nodes --format=tmpfiles --output=/ru…nf? [Yes, No, View, Skip]

A following patch will introduce a new choice that will allow the user to get
details on the command to be executed so it will still be possible to see the
full command line.

											
										
										
											2016-11-07 17:14:59 +01:00
+								        e = ellipsize(cmdline, 60, 100);
 								        if (!e) {
 								                log_oom();
 								                r = CONFIRM_EXECUTE;
 								                goto restore_stdio;
 								        }
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								        for (;;) {
-												core: in confirm spawn, suggest 'f' when user selects 'n' choice

											
										
										
											2016-11-17 18:22:43 +01:00
+								                r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                if (r < 0) {
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								                        write_confirm_error_fd(r, STDOUT_FILENO, u);
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                        r = CONFIRM_EXECUTE;
 								                        goto restore_stdio;
 								                }
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                switch (c) {
-												core: add 'c' in confirmation_spawn to resume the boot process

											
										
										
											2016-11-15 09:29:04 +01:00
+								                case 'c':
 								                        printf("Resuming normal execution.\n");
 								                        manager_disable_confirm_spawn();
 								                        r = 1;
 								                        break;
-												core: add 'D' in confirmat spawn to show a full dump of the unit to spawn

											
										
										
											2016-11-12 15:08:29 +01:00
+								                case 'D':
 								                        unit_dump(u, stdout, "  ");
 								                        continue; /* ask again */
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                case 'f':
 								                        printf("Failing execution.\n");
 								                        r = CONFIRM_PRETEND_FAILURE;
 								                        break;
 								                case 'h':
-												core: add 'c' in confirmation_spawn to resume the boot process

											
										
										
											2016-11-15 09:29:04 +01:00
+								                        printf("  c - continue, proceed without asking anymore\n"
 								                               "  D - dump, show the state of the unit\n"
-												core: add 'D' in confirmat spawn to show a full dump of the unit to spawn

											
										
										
											2016-11-12 15:08:29 +01:00
+								                               "  f - fail, don't execute the command and pretend it failed\n"
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                               "  h - help\n"
-												core: add 'i' in confirm spawn to give a short summary of the unit to spawn

											
										
										
											2016-11-12 14:55:12 +01:00
+								                               "  i - info, show a short summary of the unit\n"
-												core: add 'j' in confirmation_spawn to list the jobs that are in progress

											
										
										
											2016-11-13 16:28:04 +01:00
+								                               "  j - jobs, show jobs that are in progress\n"
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                               "  s - skip, don't execute the command and pretend it succeeded\n"
 								                               "  y - yes, execute the command\n");
-												core: add 'D' in confirmat spawn to show a full dump of the unit to spawn

											
										
										
											2016-11-12 15:08:29 +01:00
+								                        continue; /* ask again */
-												core: add 'i' in confirm spawn to give a short summary of the unit to spawn

											
										
										
											2016-11-12 14:55:12 +01:00
+								                case 'i':
 								                        /* Short summary of the unit. The arguments must follow the label
 								                         * order: description, then unit id, then the full (non-ellipsized)
 								                         * command line. Previously id and description were swapped.
 								                         * NOTE(review): u->description is passed as-is; confirm it cannot
 								                         * be NULL here. */
 								                        printf("  Description: %s\n"
 								                               "  Unit:        %s\n"
 								                               "  Command:     %s\n",
 								                               u->description, u->id, cmdline);
 								                        continue; /* ask again */
-												core: add 'j' in confirmation_spawn to list the jobs that are in progress

											
										
										
											2016-11-13 16:28:04 +01:00
+								                case 'j':
 								                        manager_dump_jobs(u->manager, stdout, "  ");
 								                        continue; /* ask again */
-												core: in confirm spawn, suggest 'f' when user selects 'n' choice

											
										
										
											2016-11-17 18:22:43 +01:00
+								                case 'n':
 								                        /* 'n' was removed in favor of 'f'. */
 								                        printf("Didn't understand 'n', did you mean 'f'?\n");
 								                        continue; /* ask again */
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                case 's':
 								                        printf("Skipping execution.\n");
 								                        r = CONFIRM_PRETEND_SUCCESS;
 								                        break;
 								                case 'y':
 								                        r = CONFIRM_EXECUTE;
 								                        break;
 								                default:
 								                        assert_not_reached("Unhandled choice");
 								                }
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                break;
 								        }
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								restore_stdio:
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        restore_confirm_stdio(&saved_stdin, &saved_stdout);
 								        return r;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								}
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								static int get_fixed_user(const ExecContext *c, const char **user,
 								                          uid_t *uid, gid_t *gid,
 								                          const char **home, const char **shell) {
 								        /* Looks up the credentials of the user configured in c->user by calling
 								         * get_user_creds() with USER_CREDS_CLEAN, filling in *uid, *gid, *home and
 								         * *shell, and storing the name passed through the lookup in *user.
 								         *
 								         * Returns 0 without touching any output if c->user is unset, 0 on a
 								         * successful lookup, or the negative value returned by get_user_creds()
 								         * (in which case *user is not written). */
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
+								        int r;
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        const char *name;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        assert(c);
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												Revert "core/execute: set HOME, USER also for root users"

This reverts commit 8b89628a10af3863bfc97872912e9da4076a5929.

This broke #5246

											
										
										
											2017-02-09 11:43:44 +01:00
+								        if (!c->user)
 								                return 0;
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 								         * (i.e. are "/" or "/bin/nologin"). */
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												Revert "core/execute: set HOME, USER also for root users"

This reverts commit 8b89628a10af3863bfc97872912e9da4076a5929.

This broke #5246

											
										
										
											2017-02-09 11:43:44 +01:00
+								        name = c->user;
-												user-util: rework get_user_creds()

Let's fold get_user_creds_clean() into get_user_creds(), and introduce a
flags argument for it to select "clean" behaviour. This flags parameter
also learns two other new flags:

- USER_CREDS_SYNTHESIZE_FALLBACK: in this mode the user records for
  root/nobody are only synthesized as fallback. Normally, the synthesized
  records take precedence over what is in the user database.  With this
  flag set this is reversed, and the user database takes precedence, and
  the synthesized records are only used if they are missing there. This
  flag should be set in cases where doing NSS is deemed safe, and where
  there's interest in knowing the correct shell, for example if the
  admin changed root's shell to zsh or suchlike.

- USER_CREDS_ALLOW_MISSING: if set, and a UID/GID is specified by
  numeric value, and there's no user/group record for it accept it
  anyway. This allows us to fix #9767

This then also ports all users to set the most appropriate flags.

Fixes: #9767

[zj: remove one isempty() call]

											
										
										
											2018-08-02 18:36:47 +02:00
+								        r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        if (r < 0)
 								                return r;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        *user = name;
 								        return 0;
 								}
 								/* Looks up the group configured in c->group via get_group_creds(), filling in
 								 * *gid and storing the name passed through the lookup in *group.
 								 *
 								 * Returns 0 without touching any output if c->group is unset, 0 on a successful
 								 * lookup, or the negative value returned by get_group_creds() (in which case
 								 * *group is not written). */
 								static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 								        int r;
 								        const char *name;
 								        assert(c);
 								        /* No Group= configured: nothing to look up, outputs stay as the caller set them. */
 								        if (!c->group)
 								                return 0;
 								        /* Work on a local copy of the pointer; its address is handed to the lookup below. */
 								        name = c->group;
-												user-util: rework get_user_creds()

Let's fold get_user_creds_clean() into get_user_creds(), and introduce a
flags argument for it to select "clean" behaviour. This flags parameter
also learns two other new flags:

- USER_CREDS_SYNTHESIZE_FALLBACK: in this mode the user records for
  root/nobody are only synthesized as fallback. Normally, the synthesized
  records take precedence over what is in the user database.  With this
  flag set this is reversed, and the user database takes precedence, and
  the synthesized records are only used if they are missing there. This
  flag should be set in cases where doing NSS is deemed safe, and where
  there's interest in knowing the correct shell, for example if the
  admin changed root's shell to zsh or suchlike.

- USER_CREDS_ALLOW_MISSING: if set, and a UID/GID is specified by
  numeric value, and there's no user/group record for it accept it
  anyway. This allows us to fix #9767

This then also ports all users to set the most appropriate flags.

Fixes: #9767

[zj: remove one isempty() call]

											
										
										
											2018-08-02 18:36:47 +02:00
+								        r = get_group_creds(&name, gid, 0);
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        if (r < 0)
 								                return r;
 								        *group = name;
 								        return 0;
 								}
-												core: intialize user aux groups and SupplementaryGroups= when DynamicUser= is set

Make sure that when DynamicUser= is set that we initialize the user
supplementary groups and that we also support SupplementaryGroups=

Fixes: https://github.com/systemd/systemd/issues/4539

Thanks Evgeny Vereshchagin (@evverx)

											
										
										
											2016-11-02 22:42:40 +01:00
+								static int get_supplementary_groups(const ExecContext *c, const char *user,
 								                                    const char *group, gid_t gid,
 								                                    gid_t **supplementary_gids, int *ngids) {
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        char **i;
 								        int r, k = 0;
 								        int ngroups_max;
 								        bool keep_groups = false;
 								        gid_t *groups = NULL;
 								        _cleanup_free_ gid_t *l_gids = NULL;
 								        assert(c);
-												core: initialize groups list before checking SupplementaryGroups= of a unit (#4533)

Always initialize the supplementary groups of caller before checking the
unit SupplementaryGroups= option.

Fixes https://github.com/systemd/systemd/issues/4531
											
										
										
											2016-11-02 17:51:35 +01:00
+								        /*
 								         * If user is given, then lookup GID and supplementary groups list.
 								         * We avoid NSS lookups for gid=0. Also we have to initialize groups
-												core: intialize user aux groups and SupplementaryGroups= when DynamicUser= is set

Make sure that when DynamicUser= is set we initialize the user's
supplementary groups and that we also support SupplementaryGroups=

Fixes: https://github.com/systemd/systemd/issues/4539

Thanks Evgeny Vereshchagin (@evverx)

											
										
										
											2016-11-02 22:42:40 +01:00
+								         * here and as early as possible so we keep the list of supplementary
 								         * groups of the caller.
-												core: initialize groups list before checking SupplementaryGroups= of a unit (#4533)

Always initialize the supplementary groups of caller before checking the
unit SupplementaryGroups= option.

Fixes https://github.com/systemd/systemd/issues/4531
											
										
										
											2016-11-02 17:51:35 +01:00
+								         */
 								        if (user && gid_is_valid(gid) && gid != 0) {
 								                /* First step, initialize groups from /etc/group */
 								                if (initgroups(user, gid) < 0)
 								                        return -errno;
 								                keep_groups = true;
 								        }
-												core: use strv_isempty to check if supplementary_groups is empty

With the previous commit, we know that it will be NULL if empty, but
it's safe to always use strv_isempty() in case the code changes
in the future.

											
										
										
											2017-10-04 11:33:30 +02:00
+								        if (strv_isempty(c->supplementary_groups))
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                return 0;
-												core: do not assert when sysconf(_SC_NGROUPS_MAX) fails (#4466)

Remove the assert and check the return code of sysconf(_SC_NGROUPS_MAX).

_SC_NGROUPS_MAX maps to NGROUPS_MAX which is defined in <limits.h> to
65536 these days. The value is a sysctl read-only
/proc/sys/kernel/ngroups_max and the kernel assumes that it is always
positive, otherwise things may break. Follow this and support only
positive values; in all other cases return either -errno or -EOPNOTSUPP.

Now if there are systems that want to re-write NGROUPS_MAX then they
should not pass SupplementaryGroups= in units even if it is empty, in
this case nothing fails and we just ignore supplementary groups. However
if SupplementaryGroups= is passed even if it is empty we have to assume
that there will be groups manipulation from our side or the kernel and
since the kernel always assumes that NGROUPS_MAX is positive, then
follow that and support only positive values.
											
										
										
											2016-10-24 13:13:06 +02:00
+								        /*
 								         * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 								         * be positive, otherwise fail.
 								         */
 								        errno = 0;
 								        ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 								        if (ngroups_max <= 0) {
 								                if (errno > 0)
 								                        return -errno;
 								                else
 								                        return -EOPNOTSUPP; /* For all other values */
 								        }
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        l_gids = new(gid_t, ngroups_max);
 								        if (!l_gids)
 								                return -ENOMEM;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        if (keep_groups) {
 								                /*
 								                 * Lookup the list of groups that the user belongs to, we
 								                 * avoid NSS lookups here too for gid=0.
 								                 */
 								                k = ngroups_max;
 								                if (getgrouplist(user, gid, l_gids, &k) < 0)
 								                        return -EINVAL;
 								        } else
 								                k = 0;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        STRV_FOREACH(i, c->supplementary_groups) {
 								                const char *g;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                if (k >= ngroups_max)
 								                        return -E2BIG;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                g = *i;
-												user-util: rework get_user_creds()

Let's fold get_user_creds_clean() into get_user_creds(), and introduce a
flags argument for it to select "clean" behaviour. This flags parameter
also learns to other new flags:

- USER_CREDS_SYNTHESIZE_FALLBACK: in this mode the user records for
  root/nobody are only synthesized as fallback. Normally, the synthesized
  records take precedence over what is in the user database.  With this
  flag set this is reversed, and the user database takes precedence, and
  the synthesized records are only used if they are missing there. This
  flag should be set in cases where doing NSS is deemed safe, and where
  there's interest in knowing the correct shell, for example if the
  admin changed root's shell to zsh or suchlike.

- USER_CREDS_ALLOW_MISSING: if set, and a UID/GID is specified by
  numeric value, and there's no user/group record for it accept it
  anyway. This allows us to fix #9767

This then also ports all users to set the most appropriate flags.

Fixes: #9767

[zj: remove one isempty() call]

											
										
										
											2018-08-02 18:36:47 +02:00
+								                r = get_group_creds(&g, l_gids+k, 0);
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                if (r < 0)
 								                        return r;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                k++;
 								        }
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        /*
 								         * Sets ngids to zero to drop all supplementary groups, happens
 								         * when we are under root and SupplementaryGroups= is empty.
 								         */
 								        if (k == 0) {
 								                *ngids = 0;
 								                return 0;
 								        }
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        /* Otherwise get the final list of supplementary groups */
 								        groups = memdup(l_gids, sizeof(gid_t) * k);
 								        if (!groups)
 								                return -ENOMEM;
 								        *supplementary_gids = groups;
 								        *ngids = k;
 								        groups = NULL;
 								        return 0;
 								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        int r;
-												core: cleanup for enforce_groups() (#7064)

SupplementaryGroups= is preprocessed in get_supplementary_groups().
So, it is not necessary to input ExecContext to enforce_groups().
											
										
										
											2017-10-12 08:10:25 +02:00
+								        /* Handle SupplementaryGroups= if it is not empty */
 								        if (ngids > 0) {
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                r = maybe_setgroups(ngids, supplementary_gids);
 								                if (r < 0)
-												user-util: rework maybe_setgroups() a bit

Let's drop the caching of the setgroups /proc field for now. While there's a
strict regime in place when it changes states, let's better not cache it since
we cannot really be sure we follow that regime correctly.

More importantly however, this is not in performance sensitive code, and
there's no indication the cache is really beneficial, hence let's drop the
caching and make things a bit simpler.

Also, while we are at it, rework the error handling a bit, and always return
negative errno-style error codes, following our usual coding style. This has
the benefit that we can sensibly handle read_one_line_file() errors, without
having to update errno explicitly.

											
										
										
											2016-10-06 17:54:12 +02:00
+								                        return r;
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        }
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        if (gid_is_valid(gid)) {
 								                /* Then set our gids */
 								                if (setresgid(gid, gid, gid) < 0)
 								                        return -errno;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
+								        }
 								        return 0;
 								}
 								static int enforce_user(const ExecContext *context, uid_t uid) {
 								        assert(context);
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        if (!uid_is_valid(uid))
 								                return 0;
-												core: drop Capabilities= setting

The setting is hardly useful (since its effect is generally reduced to zero due
to file system caps), and with the advent of ambient caps an actually useful
replacement exists, hence let's get rid of this.

I am pretty sure this was unused and our man page already recommended against
its use, hence this should be a safe thing to remove.

											
										
										
											2016-02-12 23:29:57 +01:00
+								        /* Sets (but doesn't look up) the uid and make sure we keep the
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
+								         * capabilities while doing so. */
-												core: drop Capabilities= setting

The setting is hardly useful (since its effect is generally reduced to zero due
to file system caps), and with the advent of ambient caps an actually useful
replacement exists, hence let's get rid of this.

I am pretty sure this was unused and our man page already recommended against
its use, hence this should be a safe thing to remove.

											
										
										
											2016-02-12 23:29:57 +01:00
+								        if (context->capability_ambient_set != 0) {
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
 								                /* First step: If we need to keep capabilities but
 								                 * drop privileges we need to make sure we keep our
-												execute: convert secure bits into mask properly

C.f. http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5975c725dfd6f7d36f493ab1453fbdbd35c1f0e3

											
										
										
											2013-03-30 06:40:11 +01:00
+								                 * caps, while we drop privileges. */
-												execute: try to suppress PR_SET_SECUREBITS if unnecessary

											
										
										
											2010-03-31 16:25:33 +02:00
+								                if (uid != 0) {
-												execute: convert secure bits into mask properly

C.f. http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5975c725dfd6f7d36f493ab1453fbdbd35c1f0e3

											
										
										
											2013-03-30 06:40:11 +01:00
+								                        int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
-												execute: try to suppress PR_SET_SECUREBITS if unnecessary

											
										
										
											2010-03-31 16:25:33 +02:00
 								                        if (prctl(PR_GET_SECUREBITS) != sb)
 								                                if (prctl(PR_SET_SECUREBITS, sb) < 0)
 								                                        return -errno;
 								                }
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
+								        }
-												core: drop Capabilities= setting

The setting is hardly useful (since its effect is generally reduced to zero due
to file system caps), and with the advent of ambient caps an actually useful
replacement exists, hence let's get rid of this.

I am pretty sure this was unused and our man page already recommended against
its use, hence this should be a safe thing to remove.

											
										
										
											2016-02-12 23:29:57 +01:00
+								        /* Second step: actually set the uids */
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
+								        if (setresuid(uid, uid, uid) < 0)
 								                return -errno;
 								        /* At this point we should have all necessary capabilities but
 								           are otherwise a normal user. However, the caps might have got
 								           corrupted due to the setresuid() so we need to clean them up
 								           later. This is done outside of this call. */
 								        return 0;
 								}
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_PAM
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
 								/* PAM conversation callback that rejects every request: conversations
 								 * are not supported, so PAM_CONV_ERR is returned regardless of the
 								 * arguments passed in. */
 								static int null_conv(int num_msg, const struct pam_message **msg,
 								                     struct pam_response **resp, void *appdata_ptr) {
 								        return PAM_CONV_ERR;
 								}
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
+								#endif
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								static int setup_pam(
 								                const char *name,
 								                const char *user,
-												sd-pam: Drop uid so parent signal arrives at child.

The PAM helper thread needs to capture the death signal from the
parent, but is prohibited from doing so since when the child dies
as normal user, the kernel won't allow it to send a TERM to the
PAM helper thread which is running as root.

This causes the PAM threads to never exit, accumulating after
user sessions exit.

There is however really no need to keep the PAM threads running as
root, so, we can just setresuid() to the same user as defined in the
unit file for the parent thread (User=). This makes the TERM signal
arrive as normal. In case setresuid() fails, we ignore the error, so
we at least fall back to the current behaviour.

											
										
										
											2012-05-17 21:17:42 +02:00
+								                uid_t uid,
-												core: leave PAM stub process around with GIDs updated

In the process execution code of PID 1, before
096424d1230e0a0339735c51b43949809e972430 the GID settings were changed before
invoking PAM, and the UID settings after. After the change both changes are
made after the PAM session hooks are run. When invoking PAM we fork once, and
leave a stub process around which will invoke the PAM session end hooks when
the session goes away. This code previously was dropping the remaining privs
(which were precisely the UID). Fix this code to do this correctly again, by
really dropping them all (i.e. the GID as well).

While we are at it, also fix error logging of this code.

Fixes: #4238

											
										
										
											2016-10-06 16:03:01 +02:00
+								                gid_t gid,
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                const char *tty,
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								                char ***env,
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()' return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								                int fds[], size_t n_fds) {
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_PAM
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        static const struct pam_conv conv = {
 								                .conv = null_conv,
 								                .appdata_ptr = NULL
 								        };
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        pam_handle_t *handle = NULL;
-												core: execute: fix regression in pam_setup()

Commit 72c0a2c25 ("everywhere: port everything to sigprocmask_many()
and friends") reworked code tree-wide to use the new sigprocmask_many()
helper. In this, it caused a regression in pam_setup, because it
dropped a line to initialize the 'ss' signal mask which is later used
in sigwait().

While at it, move the variable declaration to an inner scope.

											
										
										
											2015-06-17 14:31:49 +02:00
+								        sigset_t old_ss;
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								        int pam_code = PAM_SUCCESS, r;
-												execute: Do not alter call-by-ref parameter on failure

Prevent free from being called on (a part of) the call-by-reference
variable env when setup_pam fails.

											
										
										
											2016-07-07 12:41:52 +02:00
+								        char **nv, **e = NULL;
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        bool close_session = false;
 								        pid_t pam_pid = 0, parent_pid;
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								        int flags = 0;
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
 								        assert(name);
 								        assert(user);
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        assert(env);
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
 								        /* We set up PAM in the parent process, then fork. The child
-												Spelling Corrections

Just some lame spelling corrections with no functionality.

											
										
										
											2011-02-21 15:32:17 +01:00
+								         * will then stay around until killed via PR_GET_PDEATHSIG or
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								         * systemd via the cgroup logic. It will then remove the PAM
 								         * session again. The parent process will exec() the actual
 								         * daemon. We do things this way to ensure that the main PID
 								         * of the daemon is the one we initially fork()ed. */
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								        r = barrier_create(&barrier);
 								        if (r < 0)
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								                goto fail;
-												tree-wide: remove unnecessary LOG_PRI

LOG_DEBUG is already a log level, there is no need to use LOG_PRI which
is for filtering out the facility.

											
										
										
											2015-01-06 06:29:40 +01:00
+								        if (log_get_max_level() < LOG_DEBUG)
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                flags |= PAM_SILENT;
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        pam_code = pam_start(name, user, &conv, &handle);
 								        if (pam_code != PAM_SUCCESS) {
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                handle = NULL;
 								                goto fail;
 								        }
-												core: when setting up PAM, try to get tty of STDIN_FILENO if not set explicitly

When stdin/stdout/stderr is initialized from an fd, let's read the tty
name of it if we can, and pass that to PAM.

This makes sure that "machinectl shell" sessions have proper TTY fields
initialized that "loginctl" then shows.

											
										
										
											2018-08-03 21:30:16 +02:00
+								        if (!tty) {
 								                _cleanup_free_ char *q = NULL;
 								                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
 								                 * out if that's the case, and read the TTY off it. */
 								                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
 								                        tty = strjoina("/dev/", q);
 								        }
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        if (tty) {
 								                pam_code = pam_set_item(handle, PAM_TTY, tty);
 								                if (pam_code != PAM_SUCCESS)
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                        goto fail;
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        }
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												execute: Do not alter call-by-ref parameter on failure

Prevent free from being called on (a part of) the call-by-reference
variable env when setup_pam fails.

											
										
										
											2016-07-07 12:41:52 +02:00
+								        STRV_FOREACH(nv, *env) {
 								                pam_code = pam_putenv(handle, *nv);
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								                if (pam_code != PAM_SUCCESS)
 								                        goto fail;
 								        }
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								        pam_code = pam_acct_mgmt(handle, flags);
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        if (pam_code != PAM_SUCCESS)
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                goto fail;
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								        pam_code = pam_open_session(handle, flags);
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        if (pam_code != PAM_SUCCESS)
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                goto fail;
 								        close_session = true;
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        e = pam_getenvlist(handle);
 								        if (!e) {
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                pam_code = PAM_BUF_ERR;
 								                goto fail;
 								        }
 								        /* Block SIGTERM, so that we know that it won't get lost in
 								         * the child */
-												tree-wide: whenever we fork off a foreign child process reset signal mask/handlers

Also, when the child is potentially long-running make sure to set a
death signal.

Also, ignore the result of the reset operations explicitly by casting
them to (void).

											
										
										
											2015-05-31 23:55:55 +02:00
-												everywhere: port everything to sigprocmask_many() and friends

This ports a lot of manual code over to sigprocmask_many() and friends.

Also, we now consistently check for sigprocmask() failures with
assert_se(), since the call cannot realistically fail unless there's a
programming error.

Also encloses a few sd_event_add_signal() calls with (void) when we
ignore the return values for it knowingly.

											
										
										
											2015-06-15 20:13:23 +02:00
+								        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												tree-wide: make use of getpid_cached() wherever we can

This moves pretty much all uses of getpid() over to getpid_cached(). I
didn't specifically check whether the optimization is worth it for each
replacement, but in order to keep things simple and systematic I
switched over everything at once.

											
										
										
											2017-07-20 16:19:18 +02:00
+								        parent_pid = getpid_cached();
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												tree-wide: introduce new safe_fork() helper and port everything over

This adds a new safe_fork() wrapper around fork() and makes use of it
everywhere. The new wrapper does a couple of things we previously did
manually and separately in a safer, more correct and automatic way:

1. Optionally resets signal handlers/mask in the child

2. Sets a name on all processes we fork off right after forking off (and
   the patch assigns useful names for all processes we fork off now,
   following a systematic naming scheme: always enclosed in () – in order
   to indicate that these are not proper, exec()ed processes, but only
   forked off children, and if the process is long-running with only our
   own code, without execve()'ing something else, it gets an "sd-" prefix.)

3. Optionally closes all file descriptors in the child

4. Optionally sets PR_SET_PDEATHSIG to SIGTERM in the child, in a safe
   way, so that the parent dying before this happens is handled
   safely.

5. Optionally reopens the logs

6. Optionally connects stdin/stdout/stderr to /dev/null

7. Debug logs about the forked off processes.

											
										
										
											2017-12-22 13:08:14 +01:00
+								        r = safe_fork("(sd-pam)", 0, &pam_pid);
 								        if (r < 0)
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                goto fail;
-												tree-wide: introduce new safe_fork() helper and port everything over

This adds a new safe_fork() wrapper around fork() and makes use of it
everywhere. The new wrapper does a couple of things we previously did
manually and separately in a safer, more correct and automatic way:

1. Optionally resets signal handlers/mask in the child

2. Sets a name on all processes we fork off right after forking off (and
   the patch assigns useful names for all processes we fork off now,
   following a systematic naming scheme: always enclosed in () – in order
   to indicate that these are not proper, exec()ed processes, but only
   forked off children, and if the process is long-running with only our
   own code, without execve()'ing something else, it gets an "sd-" prefix.)

3. Optionally closes all file descriptors in the child

4. Optionally sets PR_SET_PDEATHSIG to SIGTERM in the child, in a safe
   way, so that the parent dying before this happens is handled
   safely.

5. Optionally reopens the logs

6. Optionally connects stdin/stdout/stderr to /dev/null

7. Debug logs about the forked off processes.

											
										
										
											2017-12-22 13:08:14 +01:00
+								        if (r == 0) {
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								                int sig, ret = EXIT_PAM;
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
 								                /* The child's job is to reset the PAM session on
 								                 * termination */
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								                barrier_set_role(&barrier, BARRIER_CHILD);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												tree-wide: introduce new safe_fork() helper and port everything over

This adds a new safe_fork() wrapper around fork() and makes use of it
everywhere. The new wrapper does a couple of things we previously did
manually and separately in a safer, more correct and automatic way:

1. Optionally resets signal handlers/mask in the child

2. Sets a name on all processes we fork off right after forking off (and
   the patch assigns useful names for all processes we fork off now,
   following a systematic naming scheme: always enclosed in () – in order
   to indicate that these are not proper, exec()ed processes, but only
   forked off children, and if the process is long-running with only our
   own code, without execve()'ing something else, it gets an "sd-" prefix.)

3. Optionally closes all file descriptors in the child

4. Optionally sets PR_SET_PDEATHSIG to SIGTERM in the child, in a safe
   way, so that the parent dying before this happens is handled
   safely.

5. Optionally reopens the logs

6. Optionally connects stdin/stdout/stderr to /dev/null

7. Debug logs about the forked off processes.

											
										
										
											2017-12-22 13:08:14 +01:00
+								                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
 								                 * are open here that have been opened by PAM. */
 								                (void) close_many(fds, n_fds);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												sd-pam: Drop uid so parent signal arrives at child.

The PAM helper thread needs to capture the death signal from the
parent, but is prohibited from doing so since when the child dies
as normal user, the kernel won't allow it to send a TERM to the
PAM helper thread which is running as root.

This causes the PAM threads to never exit, accumulating after
user sessions exit.

There is however really no need to keep the PAM threads running as
root, so, we can just setresuid() to the same user as defined in the
unit file for the parent thread (User=). This makes the TERM signal
arrive as normal. In case setresuid() fails, we ignore the error, so
we at least fall back to the current behaviour.

											
										
										
											2012-05-17 21:17:42 +02:00
+								                /* Drop privileges - we don't need any to pam_close_session
 								                 * and this will make PR_SET_PDEATHSIG work in most cases.
 								                 * If this fails, ignore the error - but expect sd-pam threads
 								                 * to fail to exit normally */
-												core: leave PAM stub process around with GIDs updated

In the process execution code of PID 1, before
096424d1230e0a0339735c51b43949809e972430 the GID settings were changed before
invoking PAM, and the UID settings after. After the change both changes are
made after the PAM session hooks are run. When invoking PAM we fork once, and
leave a stub process around which will invoke the PAM session end hooks when
the session goes away. This code previously was dropping the remaining privs
(which were precisely the UID). Fix this code to do this correctly again, by
really dropping them all (i.e. the GID as well).

While we are at it, also fix error logging of this code.

Fixes: #4238

											
										
										
											2016-10-06 16:03:01 +02:00
-												user-util: rework maybe_setgroups() a bit

Let's drop the caching of the setgroups /proc field for now. While there's a
strict regime in place when it changes states, let's better not cache it since
we cannot really be sure we follow that regime correctly.

More importantly however, this is not in performance sensitive code, and
there's no indication the cache is really beneficial, hence let's drop the
caching and make things a bit simpler.

Also, while we are at it, rework the error handling a bit, and always return
negative errno-style error codes, following our usual coding style. This has
the benefit that we can sensibly handle read_one_line_file() errors, without
having to update errno explicitly.

											
										
										
											2016-10-06 17:54:12 +02:00
+								                r = maybe_setgroups(0, NULL);
 								                if (r < 0)
 								                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
-												core: leave PAM stub process around with GIDs updated

In the process execution code of PID 1, before
096424d1230e0a0339735c51b43949809e972430 the GID settings were changed before
invoking PAM, and the UID settings after. After the change both changes are
made after the PAM session hooks are run. When invoking PAM we fork once, and
leave a stub process around which will invoke the PAM session end hooks when
the session goes away. This code previously was dropping the remaining privs
(which were precisely the UID). Fix this code to do this correctly again, by
really dropping them all (i.e. the GID as well).

While we are at it, also fix error logging of this code.

Fixes: #4238

											
										
										
											2016-10-06 16:03:01 +02:00
+								                if (setresgid(gid, gid, gid) < 0)
 								                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
-												sd-pam: Drop uid so parent signal arrives at child.

The PAM helper thread needs to capture the death signal from the
parent, but is prohibited from doing so since when the child dies
as normal user, the kernel won't allow it to send a TERM to the
PAM helper thread which is running as root.

This causes the PAM threads to never exit, accumulating after
user sessions exit.

There is however really no need to keep the PAM threads running as
root, so, we can just setresuid() to the same user as defined in the
unit file for the parent thread (User=). This makes the TERM signal
arrive as normal. In case setresuid() fails, we ignore the error, so
we at least fall back to the current behaviour.

											
										
										
											2012-05-17 21:17:42 +02:00
+								                if (setresuid(uid, uid, uid) < 0)
-												core: leave PAM stub process around with GIDs updated

In the process execution code of PID 1, before
096424d1230e0a0339735c51b43949809e972430 the GID settings were changed before
invoking PAM, and the UID settings after. After the change both changes are
made after the PAM session hooks are run. When invoking PAM we fork once, and
leave a stub process around which will invoke the PAM session end hooks when
the session goes away. This code previously was dropping the remaining privs
(which were precisely the UID). Fix this code to do this correctly again, by
really dropping them all (i.e. the GID as well).

While we are at it, also fix error logging of this code.

Fixes: #4238

											
										
										
											2016-10-06 16:03:01 +02:00
+								                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
-												sd-pam: Drop uid so parent signal arrives at child.

The PAM helper thread needs to capture the death signal from the
parent, but is prohibited from doing so since when the child dies
as normal user, the kernel won't allow it to send a TERM to the
PAM helper thread which is running as root.

This causes the PAM threads to never exit, accumulating after
user sessions exit.

There is however really no need to keep the PAM threads running as
root, so, we can just setresuid() to the same user as defined in the
unit file for the parent thread (User=). This makes the TERM signal
arrive as normal. In case setresuid() fails, we ignore the error, so
we at least fall back to the current behaviour.

											
										
										
											2012-05-17 21:17:42 +02:00
-												tree-wide: whenever we fork off a foreign child process reset signal mask/handlers

Also, when the child is potentially long-running make sure to set a
death signal.

Also, ignore the result of the reset operations explicitly by casting
them to (void).

											
										
										
											2015-05-31 23:55:55 +02:00
+								                (void) ignore_signals(SIGPIPE, -1);
-												sd-pam: Drop uid so parent signal arrives at child.

The PAM helper thread needs to capture the death signal from the
parent, but is prohibited from doing so since when the child dies
as normal user, the kernel won't allow it to send a TERM to the
PAM helper thread which is running as root.

This causes the PAM threads to never exit, accumulating after
user sessions exit.

There is however really no need to keep the PAM threads running as
root, so, we can just setresuid() to the same user as defined in the
unit file for the parent thread (User=). This makes the TERM signal
arrive as normal. In case setresuid() fails, we ignore the error, so
we at least fall back to the current behaviour.

											
										
										
											2012-05-17 21:17:42 +02:00
+								                /* Wait until our parent died. This will only work if
 								                 * the above setresuid() succeeds, otherwise the kernel
 								                 * will not allow unprivileged parents kill their privileged
 								                 * children this way. We rely on the control groups kill logic
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                 * to do the rest for us. */
 								                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
 								                        goto child_finish;
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								                /* Tell the parent that our setup is done. This is especially
 								                 * important regarding dropping privileges. Otherwise, unit
-												core/execute: add (void)

CID #778045.

											
										
										
											2017-02-19 19:48:59 +01:00
+								                 * setup might race against our setresuid(2) call.
 								                 *
 								                 * If the parent aborted, we'll detect this below, hence ignore
 								                 * return failure here. */
 								                (void) barrier_place(&barrier);
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
-												core/execute: add (void)

CID #778045.

											
										
										
											2017-02-19 19:48:59 +01:00
+								                /* Check if our parent process might already have died? */
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                if (getppid() == parent_pid) {
-												core: execute: fix regression in pam_setup()

Commit 72c0a2c25 ("everywhere: port everything to sigprocmask_many()
and friends") reworked code tree-wide to use the new sigprocmask_many()
helper. In this, it caused a regression in pam_setup, because it
dropped a line to initialize the 'ss' signal mask which is later used
in sigwait().

While at it, move the variable declaration to an inner scope.

											
										
										
											2015-06-17 14:31:49 +02:00
+								                        sigset_t ss;
 								                        assert_se(sigemptyset(&ss) >= 0);
 								                        assert_se(sigaddset(&ss, SIGTERM) >= 0);
-												execute: invoke sigwait() in a loop when waiting for PAM parent, to avoid spurious wake-ups

											
										
										
											2011-06-30 04:15:39 +02:00
+								                        for (;;) {
 								                                if (sigwait(&ss, &sig) < 0) {
 								                                        if (errno == EINTR)
 								                                                continue;
 								                                        goto child_finish;
 								                                }
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												execute: invoke sigwait() in a loop when waiting for PAM parent, to avoid spurious wake-ups

											
										
										
											2011-06-30 04:15:39 +02:00
+								                                assert(sig == SIGTERM);
 								                                break;
 								                        }
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                }
-												execute: invoke sigwait() in a loop when waiting for PAM parent, to avoid spurious wake-ups

											
										
										
											2011-06-30 04:15:39 +02:00
+								                /* If our parent died we'll end the session */
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								                if (getppid() != parent_pid) {
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                        pam_code = pam_close_session(handle, flags);
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								                        if (pam_code != PAM_SUCCESS)
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                                goto child_finish;
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								                }
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								                ret = 0;
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
 								        child_finish:
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                pam_end(handle, pam_code | flags);
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								                _exit(ret);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        }
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								        barrier_set_role(&barrier, BARRIER_PARENT);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        /* If the child was forked off successfully it will do all the
 								         * cleanups, so forget about the handle here. */
 								        handle = NULL;
-												execute: do initgroups() first, pam initialization second so that it can still modify the groups list

											
										
										
											2011-06-30 02:15:01 +02:00
+								        /* Unblock SIGTERM again in the parent */
-												everywhere: port everything to sigprocmask_many() and friends

This ports a lot of manual code over to sigprocmask_many() and friends.

Also, we now consistently check for sigprocmask() failures with
assert_se(), since the call cannot realistically fail unless there's a
programming error.

Also encloses a few sd_event_add_signal() calls with (void) when we
ignore the return values for it knowingly.

											
										
										
											2015-06-15 20:13:23 +02:00
+								        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
 								        /* We close the log explicitly here, since the PAM modules
 								         * might have opened it, but we don't want this fd around. */
 								        closelog();
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								        /* Synchronously wait for the child to initialize. We don't care for
 								         * errors as we cannot recover. However, warn loudly if it happens. */
 								        if (!barrier_place_and_sync(&barrier))
 								                log_error("PAM initialization failed");
-												tree-wide: use strv_free_and_replace() macro

											
										
										
											2018-05-09 17:34:46 +02:00
+								        return strv_free_and_replace(*env, e);
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
 								fail:
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								        if (pam_code != PAM_SUCCESS) {
 								                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								                r = -EPERM;  /* PAM errors do not map to errno */
 								        } else
 								                log_error_errno(r, "PAM failed: %m");
-												execute: make setup_pam() return -errno when possible

The only caller currently checks if the result is non-zero,
so nothing changes there.

											
										
										
											2011-11-17 00:16:22 +01:00
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        if (handle) {
 								                if (close_session)
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                        pam_code = pam_close_session(handle, flags);
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                pam_end(handle, pam_code | flags);
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        }
 								        strv_free(e);
 								        closelog();
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								        return r;
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
+								#else
 								        return 0;
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								#endif
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
+								}
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
-												exec: include path name of binary we are about to execute when renaming forked off processes

Immediately after forking off a process change the comm name and argv[0]
to "(foobar)" where "foobar" is the basename of the path we are about to
execute.

This should be useful when charting boot progress.

											
										
										
											2012-02-01 22:33:15 +01:00
+								static void rename_process_from_path(const char *path) {
 								        char process_name[11];
 								        const char *p;
 								        size_t l;
 								        /* This resulting string must fit in 10 chars (i.e. the length
 								         * of "/sbin/init") to look pretty in /bin/ps */
-												Get rid of our reimplementation of basename

The only problem is that libgen.h #defines basename to point to its
own broken implementation instead of the GNU one. This can be fixed
by #undefining basename.

											
										
										
											2013-12-07 03:29:55 +01:00
+								        p = basename(path);
-												exec: include path name of binary we are about to execute when renaming forked off processes

Immediately after forking off a process change the comm name and argv[0]
to "(foobar)" where "foobar" is the basename of the path we are about to
execute.

This should be useful when charting boot progress.

											
										
										
											2012-02-01 22:33:15 +01:00
+								        if (isempty(p)) {
 								                rename_process("(...)");
 								                return;
 								        }
 								        l = strlen(p);
 								        if (l > 8) {
 								                /* The end of the process name is usually more
 								                 * interesting, since the first bit might just be
 								                 * "systemd-" */
 								                p = p + l - 8;
 								                l = 8;
 								        }
 								        process_name[0] = '(';
 								        memcpy(process_name+1, p, l);
 								        process_name[1+l] = ')';
 								        process_name[1+l+1] = 0;
 								        rename_process(process_name);
 								}
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								static bool context_has_address_families(const ExecContext *c) {
 								        /* True when the address-family whitelist flag is set on the context, or when
 								         * its address_families set holds at least one entry. */
 								        assert(c);
 								        if (c->address_families_whitelist)
 								                return true;
 								        return !set_isempty(c->address_families);
 								}
 								static bool context_has_syscall_filters(const ExecContext *c) {
 								        assert(c);
 								        return c->syscall_whitelist ||
-												core: add support to specify errno in SystemCallFilter=

This lets each system call in the SystemCallFilter= blacklist optionally
take an errno name or number after a colon. The errno takes precedence
over the one given by SystemCallErrorNumber=.

C.f. #7173.
Closes #7169.

											
										
										
											2017-11-11 13:35:49 +01:00
+								                !hashmap_isempty(c->syscall_filter);
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								}
 								static bool context_has_no_new_privileges(const ExecContext *c) {
 								        assert(c);
 								        if (c->no_new_privileges)
 								                return true;
 								        if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
 								                return false;
 								        /* We need NNP if we have any form of seccomp and are unprivileged */
 								        return context_has_address_families(c) ||
 								                c->memory_deny_write_execute ||
 								                c->restrict_realtime ||
 								                exec_context_restrict_namespaces_set(c) ||
 								                c->protect_kernel_tunables ||
 								                c->protect_kernel_modules ||
 								                c->private_devices ||
 								                context_has_syscall_filters(c) ||
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
+								                !set_isempty(c->syscall_archs) ||
 								                c->lock_personality;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								}
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
-												core: simplify skip_seccomp_unavailable() a bit

Let's prefer early-exit over deep-indented if blocks. No behavioural change.

											
										
										
											2016-10-21 20:03:51 +02:00
 								        if (is_seccomp_available())
 								                return false;
 								        log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
 								        return true;
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								}
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        uint32_t negative_action, default_action, action;
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        int r;
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												syscallfilter: port to libseccomp

											
										
										
											2014-02-12 01:29:54 +01:00
+								        assert(c);
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!context_has_syscall_filters(c))
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "SystemCallFilter="))
 								                return 0;
-												seccomp: add helper call to add all secondary archs to a seccomp filter

And make use of it where appropriate for executing services and for
nspawn.

											
										
										
											2014-02-18 22:14:00 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
-												seccomp: add helper call to add all secondary archs to a seccomp filter

And make use of it where appropriate for executing services and for
nspawn.

											
										
										
											2014-02-18 22:14:00 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (c->syscall_whitelist) {
 								                default_action = negative_action;
 								                action = SCMP_ACT_ALLOW;
-												seccomp: we should control NO_NEW_PRIVS on our own, not let seccomp do this for us

											
										
										
											2014-02-25 20:32:27 +01:00
+								        } else {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                default_action = SCMP_ACT_ALLOW;
 								                action = negative_action;
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								        }
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_ambient_hack) {
 								                r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
 								                if (r < 0)
 								                        return r;
 								        }
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
+								}
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
 								        assert(u);
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
+								        assert(c);
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (set_isempty(c->syscall_archs))
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
 								                return 0;
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_restrict_archs(c->syscall_archs);
 								}
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								static int apply_address_families(const Unit* u, const ExecContext *c) {
 								        assert(u);
 								        assert(c);
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!context_has_address_families(c))
 								                return 0;
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
 								                return 0;
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
+								}
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PROT_WRITE|PROT_EXEC
and mprotect(2) with PROT_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								        assert(c);
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!c->memory_deny_write_execute)
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
 								                return 0;
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PROT_WRITE|PROT_EXEC
and mprotect(2) with PROT_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_memory_deny_write_execute();
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PROT_WRITE|PROT_EXEC
and mprotect(2) with PROT_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								}
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, as these may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								        assert(c);
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!c->restrict_realtime)
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "RestrictRealtime="))
 								                return 0;
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, as these may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_restrict_realtime();
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, as these may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								}
-												core: make unit argument const for apply seccomp functions

											
										
										
											2016-10-27 09:39:20 +02:00
+								static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								        assert(c);
 								        /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
 								         * let's protect even those systems where this is left on in the kernel. */
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!c->protect_kernel_tunables)
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
 								                return 0;
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_protect_sysctl();
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								}
-												core: make unit argument const for apply seccomp functions

											
										
										
											2016-10-27 09:39:20 +02:00
+								static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												core:sandbox: Add ProtectKernelModules= option

This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature which is a system wide operation.

											
										
										
											2016-10-12 13:31:21 +02:00
+								        assert(c);
-												core: rework apply_protect_kernel_modules() to use seccomp_add_syscall_filter_set()

Let's simplify this call, by making use of the new infrastructure.

This is actually more in line with Djalal's original patch but instead of
searching for the filter set in the array by its name, we can now use the set index and
jump directly to it.

											
										
										
											2016-10-21 20:12:33 +02:00
+								        /* Turn off module syscalls on ProtectKernelModules=yes */
-												core:sandbox: Add ProtectKernelModules= option

This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature which is a system wide operation.

											
										
										
											2016-10-12 13:31:21 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!c->protect_kernel_modules)
 								                return 0;
-												core:sandbox: Add ProtectKernelModules= option

This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature which is a system wide operation.

											
										
										
											2016-10-12 13:31:21 +02:00
+								        if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
 								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
-												core:sandbox: Add ProtectKernelModules= option

This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature which is a system wide operation.

											
										
										
											2016-10-12 13:31:21 +02:00
+								}
-												core: make unit argument const for apply seccomp functions

											
										
										
											2016-10-27 09:39:20 +02:00
+								static int apply_private_devices(const Unit *u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.

											
										
										
											2016-08-26 16:39:04 +02:00
+								        assert(c);
-												core: Use @raw-io syscall group to filter I/O syscalls when PrivateDevices= is set

Instead of having a local syscall list, use the @raw-io group which
contains the same set of syscalls to filter.

											
										
										
											2016-09-25 12:52:27 +02:00
+								        /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
-												execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.

											
										
										
											2016-08-26 16:39:04 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!c->private_devices)
 								                return 0;
-												execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.

											
										
										
											2016-08-26 16:39:04 +02:00
+								        if (skip_seccomp_unavailable(u, "PrivateDevices="))
 								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
-												execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.

											
										
										
											2016-08-26 16:39:04 +02:00
+								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								        assert(c);
 								        if (!exec_context_restrict_namespaces_set(c))
 								                return 0;
 								        if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
 								                return 0;
 								        return seccomp_restrict_namespaces(c->restrict_namespaces);
 								}
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
+								static int apply_lock_personality(const Unit* u, const ExecContext *c) {
-												seccomp: default to something resembling the current personality when locking it

Let's lock the personality to the currently set one, if nothing is
specifically specified. But do so with a grain of salt, and never
default to any exotic personality here, but only PER_LINUX or
PER_LINUX32.

											
										
										
											2017-08-09 20:40:26 +02:00
+								        unsigned long personality;
 								        int r;
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
 								        assert(u);
 								        assert(c);
 								        if (!c->lock_personality)
 								                return 0;
 								        if (skip_seccomp_unavailable(u, "LockPersonality="))
 								                return 0;
-												seccomp: default to something resembling the current personality when locking it

Let's lock the personality to the currently set one, if nothing is
specifically specified. But do so with a grain of salt, and never
default to any exotic personality here, but only PER_LINUX or
PER_LINUX32.

											
										
										
											2017-08-09 20:40:26 +02:00
+								        personality = c->personality;
 								        /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
 								        if (personality == PERSONALITY_INVALID) {
 								                r = opinionated_personality(&personality);
 								                if (r < 0)
 								                        return r;
 								        }
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
 								        return seccomp_lock_personality(personality);
 								}
-												syscallfilter: port to libseccomp

											
										
										
											2014-02-12 01:29:54 +01:00
+								#endif
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
+								static void do_idle_pipe_dance(int idle_pipe[4]) {
 								        assert(idle_pipe);
-												execute: invalidate idle pipe after use

Not strictly necessary, but makes clear the fds are invalidated. Make
sure we do the same here as in most other cases.

											
										
										
											2015-09-11 18:14:11 +02:00
+								        idle_pipe[1] = safe_close(idle_pipe[1]);
 								        idle_pipe[2] = safe_close(idle_pipe[2]);
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
 								        if (idle_pipe[0] >= 0) {
 								                int r;
 								                r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
 								                if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
-												execute: fix return type from write()

											
										
										
											2015-09-11 18:15:08 +02:00
+								                        ssize_t n;
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
+								                        /* Signal systemd that we are bored and want to continue. */
-												execute: fix return type from write()

											
										
										
											2015-09-11 18:15:08 +02:00
+								                        n = write(idle_pipe[3], "x", 1);
 								                        if (n > 0)
-												core: don't wait for reply if writing to pipe fails

This shouldn't really happen, but it seems cleaner to
continue on error.

CID #1237552.

											
										
										
											2015-03-14 03:20:53 +01:00
+								                                /* Wait for systemd to react to the signal above. */
 								                                fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
+								                }
-												execute: invalidate idle pipe after use

Not strictly necessary, but makes clear the fds are invalidated. Make
sure we do the same here as in most other cases.

											
										
										
											2015-09-11 18:14:11 +02:00
+								                idle_pipe[0] = safe_close(idle_pipe[0]);
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
 								        }
-												execute: invalidate idle pipe after use

Not strictly necessary, but makes clear the fds are invalidated. Make
sure we do the same here as in most other cases.

											
										
										
											2015-09-11 18:14:11 +02:00
+								        idle_pipe[3] = safe_close(idle_pipe[3]);
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
+								}
-												core: add new environment variable $RUNTIME_DIRECTORY= or friends

The variable is generated from RuntimeDirectory= or friends.
If multiple directories are set, then they are concatenated with
the separator ':'.

											
										
										
											2018-09-11 07:05:08 +02:00
+								static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								static int build_environment(
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								                const Unit *u,
-												exec: factor out most function arguments of exec_spawn() to ExecParameters

This way, the list of arguments to that function gets more comprehensive,
and we can get around passing lots of NULL and 0 arguments from socket.c,
swap.c and mount.c.

It also allows for splitting up the code in exec_spawn().

While at it, make ExecContext const in execute.c.

											
										
										
											2014-08-23 15:28:37 +02:00
+								                const ExecContext *c,
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                const ExecParameters *p,
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								                size_t n_fds,
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								                const char *home,
 								                const char *username,
 								                const char *shell,
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								                dev_t journal_stream_dev,
 								                ino_t journal_stream_ino,
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								                char ***ret) {
 								        _cleanup_strv_free_ char **our_env = NULL;
-												core: add new environment variable $RUNTIME_DIRECTORY= or friends

The variable is generated from RuntimeDirectory= or friends.
If multiple directories are set, then they are concatenated with
the separator ':'.

											
										
										
											2018-09-11 07:05:08 +02:00
+								        ExecDirectoryType t;
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								        size_t n_env = 0;
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        char *x;
-												core: add "invocation ID" concept to service manager

This adds a new invocation ID concept to the service manager. The invocation ID
identifies each runtime cycle of a unit uniquely. A new randomized 128bit ID is
generated each time a unit moves from an inactive to an activating or active
state.

The primary usecase for this concept is to connect the runtime data PID 1
maintains about a service with the offline data the journal stores about it.
Previously we'd use the unit name plus start/stop times, which however is
highly racy since the journal will generally process log data after the service
already ended.

The "invocation ID" kinda matches the "boot ID" concept of the Linux kernel,
except that it applies to an individual unit instead of the whole system.

The invocation ID is passed to the activated processes as environment variable.
It is additionally stored as extended attribute on the cgroup of the unit. The
latter is used by journald to automatically retrieve it for each logged
message and attach it to the log entry. The environment variable is very easily
accessible, even for unprivileged services. OTOH the extended attribute is only
accessible to privileged processes (this is because cgroupfs only supports the
"trusted." xattr namespace, not "user."). The environment variable may be
altered by services, the extended attribute may not be, hence is the better
choice for the journal.

Note that reading the invocation ID off the extended attribute from journald is
racy, similar to the way reading the unit name for a logging process is.

This patch adds APIs to read the invocation ID to sd-id128:
sd_id128_get_invocation() may be used in a similar fashion to
sd_id128_get_boot().

PID1's own logging is updated to always include the invocation ID when it logs
information about a unit.

A new bus call GetUnitByInvocationID() is added that allows retrieving a bus
path to a unit by its invocation ID. The bus path is built using the invocation
ID, thus providing a path for referring to a unit that is valid only for the
current runtime cycle of it.

Outlook for the future: should the kernel eventually allow passing of cgroup
information along AF_UNIX/SOCK_DGRAM messages via a unique cgroup id, then we
can alter the invocation ID to be generated as hash from that rather than
entirely randomly. This way we can derive the invocation race-freely from the
messages.

											
										
										
											2016-08-30 23:18:46 +02:00
+								        assert(u);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        assert(c);
-												core: add one more assert()

											
										
										
											2018-09-11 06:18:33 +02:00
+								        assert(p);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        assert(ret);
-												core: add new environment variable $RUNTIME_DIRECTORY= or friends

The variable is generated from RuntimeDirectory= or friends.
If multiple directories are set, then they are concatenated with
the separator ':'.

											
										
										
											2018-09-11 07:05:08 +02:00
+								        our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        if (!our_env)
 								                return -ENOMEM;
 								        if (n_fds > 0) {
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								                _cleanup_free_ char *joined = NULL;
-												tree-wide: make use of getpid_cached() wherever we can

This moves pretty much all uses of getpid() over to getpid_cached(). I
didn't specifically check whether the optimization is worth it for each
replacement, but in order to keep things simple and systematic I
switched over everything at once.

											
										
										
											2017-07-20 16:19:18 +02:00
+								                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								                        return -ENOMEM;
 								                our_env[n_env++] = x;
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								                        return -ENOMEM;
 								                our_env[n_env++] = x;
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                joined = strv_join(p->fd_names, ":");
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								                if (!joined)
 								                        return -ENOMEM;
-												tree-wide: drop NULL sentinel from strjoin

This makes strjoin and strjoina more similar and avoids the useless final
argument.

spatch -I . -I ./src -I ./src/basic -I ./src/basic -I ./src/shared -I ./src/shared -I ./src/network -I ./src/locale -I ./src/login -I ./src/journal -I ./src/journal -I ./src/timedate -I ./src/timesync -I ./src/nspawn -I ./src/resolve -I ./src/resolve -I ./src/systemd -I ./src/core -I ./src/core -I ./src/libudev -I ./src/udev -I ./src/udev/net -I ./src/udev -I ./src/libsystemd/sd-bus -I ./src/libsystemd/sd-event -I ./src/libsystemd/sd-login -I ./src/libsystemd/sd-netlink -I ./src/libsystemd/sd-network -I ./src/libsystemd/sd-hwdb -I ./src/libsystemd/sd-device -I ./src/libsystemd/sd-id128 -I ./src/libsystemd-network --sp-file coccinelle/strjoin.cocci --in-place $(git ls-files src/*.c)

git grep -e '\bstrjoin\b.*NULL' -l|xargs sed -i -r 's/strjoin\((.*), NULL\)/strjoin(\1)/'

This might have missed a few cases (spatch has a really hard time dealing
with _cleanup_ macros), but that's no big issue, they can always be fixed
later.

											
										
										
											2016-10-23 17:43:27 +02:00
+								                x = strjoin("LISTEN_FDNAMES=", joined);
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        }
-												core: only set the watchdog variables in ExecStart= lines

											
										
										
											2016-08-04 22:11:29 +02:00
+								        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
-												tree-wide: make use of getpid_cached() wherever we can

This moves pretty much all uses of getpid() over to getpid_cached(). I
didn't specifically check whether the optimization is worth it for each
replacement, but in order to keep things simple and systematic I
switched over everything at once.

											
										
										
											2017-07-20 16:19:18 +02:00
+								                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
-												sd-daemon: introduce sd_watchdog_enabled() for parsing $WATCHDOG_USEC

Also, introduce a new environment variable named $WATCHDOG_PID which
contains the PID of the process that is supposed to send the keep-alive
events. This is similar to how $LISTEN_FDS and $LISTEN_PID work together,
and protects against confusing processes further down the process tree
due to inherited environment.

											
										
										
											2013-12-22 22:14:05 +01:00
+								                        return -ENOMEM;
 								                our_env[n_env++] = x;
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
-												sd-daemon: introduce sd_watchdog_enabled() for parsing $WATCHDOG_USEC

Also, introduce a new environment variable named $WATCHDOG_PID which
contains the PID of the process that is supposed to send the keep-alive
events. This is similar to how $LISTEN_FDS and $LISTEN_PID work together,
and protects against confusing processes further down the process tree
due to inherited environment.

											
										
										
											2013-12-22 22:14:05 +01:00
+								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												core: bypass dynamic user lookups from dbus-daemon

dbus-daemon does NSS name look-ups in order to enforce its bus policy. This
might dead-lock if an NSS module wants to use D-Bus for the look-up itself,
like our nss-systemd does. Let's work around this by bypassing bus
communication in the NSS module if we run inside of dbus-daemon. To make this
work we keep a bit of extra state in /run/systemd/dynamic-uid/ so that we don't
have to consult the bus, but can still resolve the names.

Note that the normal codepath continues to be via the bus, so that resolving
works from all mount namespaces and is subject to authentication, as before.

This is a bit dirty, but not too dirty, as dbus daemon is kinda special anyway
for PID 1.

											
										
										
											2016-08-02 12:28:51 +02:00
+								        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
 								         * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
 								         * check the database directly. */
-												execute: also control the SYSTEMD_NSS_BYPASS_BUS through an ExecFlags field

Also, correct the logic while we are at it: the variable is only
required for system services, not user services.

											
										
										
											2017-08-01 10:43:04 +02:00
+								        if (p->flags & EXEC_NSS_BYPASS_BUS) {
-												core: bypass dynamic user lookups from dbus-daemon

dbus-daemon does NSS name look-ups in order to enforce its bus policy. This
might dead-lock if an NSS module wants to use D-Bus for the look-up itself,
like our nss-systemd does. Let's work around this by bypassing bus
communication in the NSS module if we run inside of dbus-daemon. To make this
work we keep a bit of extra state in /run/systemd/dynamic-uid/ so that we don't
have to consult the bus, but can still resolve the names.

Note that the normal codepath continues to be via the bus, so that resolving
works from all mount namespaces and is subject to authentication, as before.

This is a bit dirty, but not too dirty, as dbus daemon is kinda special anyway
for PID 1.

											
										
										
											2016-08-02 12:28:51 +02:00
+								                x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        if (home) {
 								                x = strappend("HOME=", home);
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
 								        if (username) {
 								                x = strappend("LOGNAME=", username);
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								                x = strappend("USER=", username);
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
 								        if (shell) {
 								                x = strappend("SHELL=", shell);
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												core: add "invocation ID" concept to service manager

This adds a new invocation ID concept to the service manager. The invocation ID
identifies each runtime cycle of a unit uniquely. A new randomized 128bit ID is
generated each time a unit moves from an inactive to an activating or active
state.

The primary usecase for this concept is to connect the runtime data PID 1
maintains about a service with the offline data the journal stores about it.
Previously we'd use the unit name plus start/stop times, which however is
highly racy since the journal will generally process log data after the service
already ended.

The "invocation ID" kinda matches the "boot ID" concept of the Linux kernel,
except that it applies to an individual unit instead of the whole system.

The invocation ID is passed to the activated processes as environment variable.
It is additionally stored as extended attribute on the cgroup of the unit. The
latter is used by journald to automatically retrieve it for each logged
message and attach it to the log entry. The environment variable is very easily
accessible, even for unprivileged services. OTOH the extended attribute is only
accessible to privileged processes (this is because cgroupfs only supports the
"trusted." xattr namespace, not "user."). The environment variable may be
altered by services, the extended attribute may not be, hence is the better
choice for the journal.

Note that reading the invocation ID off the extended attribute from journald is
racy, similar to the way reading the unit name for a logging process is.

This patch adds APIs to read the invocation ID to sd-id128:
sd_id128_get_invocation() may be used in a similar fashion to
sd_id128_get_boot().

PID1's own logging is updated to always include the invocation ID when it logs
information about a unit.

A new bus call GetUnitByInvocationID() is added that allows retrieving a bus
path to a unit by its invocation ID. The bus path is built using the invocation
ID, thus providing a path for referring to a unit that is valid only for the
current runtime cycle of it.

Outlook for the future: should the kernel eventually allow passing of cgroup
information along AF_UNIX/SOCK_DGRAM messages via a unique cgroup id, then we
can alter the invocation ID to be generated as hash from that rather than
entirely randomly. This way we can derive the invocation race-freely from the
messages.

											
										
										
											2016-08-30 23:18:46 +02:00
+								        if (!sd_id128_is_null(u->invocation_id)) {
 								                if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												core: inherit TERM from PID 1 for all services started on /dev/console

This way, invoking nspawn from a shell in the best case inherits the TERM
setting all the way down into the login shell spawned in the container.

Fixes: #3697

											
										
										
											2016-07-27 15:25:55 +02:00
+								        if (exec_context_needs_term(c)) {
 								                const char *tty_path, *term = NULL;
 								                tty_path = exec_context_tty_path(c);
 								                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
 								                 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
 								                 * passes to PID 1 ends up all the way in the console login shown. */
 								                if (path_equal(tty_path, "/dev/console") && getppid() == 1)
 								                        term = getenv("TERM");
 								                if (!term)
 								                        term = default_term_for_tty(tty_path);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
-												core: inherit TERM from PID 1 for all services started on /dev/console

This way, invoking nspawn from a shell in the best case inherits the TERM
setting all the way down into the login shell spawned in the container.

Fixes: #3697

											
										
										
											2016-07-27 15:25:55 +02:00
+								                x = strappend("TERM=", term);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
 								                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												core: add new environment variable $RUNTIME_DIRECTORY= or friends

The variable is generated from RuntimeDirectory= or friends.
If multiple directories are set, then they are concatenated with
the separator ':'.

											
										
										
											2018-09-11 07:05:08 +02:00
+								        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
 								                _cleanup_free_ char *pre = NULL, *joined = NULL;
 								                const char *n;
 								                if (!p->prefix[t])
 								                        continue;
 								                if (strv_isempty(c->directories[t].paths))
 								                        continue;
 								                n = exec_directory_env_name_to_string(t);
 								                if (!n)
 								                        continue;
 								                pre = strjoin(p->prefix[t], "/");
 								                if (!pre)
 								                        return -ENOMEM;
 								                joined = strv_join_prefix(c->directories[t].paths, ":", pre);
 								                if (!joined)
 								                        return -ENOMEM;
 								                x = strjoin(n, "=", joined);
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        our_env[n_env++] = NULL;
-												core: add new environment variable $RUNTIME_DIRECTORY= or friends

The variable is generated from RuntimeDirectory= or friends.
If multiple directories are set, then they are concatenated with
the separator ':'.

											
										
										
											2018-09-11 07:05:08 +02:00
+								        assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
-												macro: introduce TAKE_PTR() macro

This macro will read a pointer of any type, return it, and set the
pointer to NULL. This is useful as an explicit concept of passing
ownership of a memory area between pointers.

This takes inspiration from Rust:

https://doc.rust-lang.org/std/option/enum.Option.html#method.take

and was suggested by Alan Jenkins (@sourcejedi).

It drops ~160 lines of code from our codebase, which makes me like it.
Also, I think it clarifies passing of ownership, and thus helps
readability a bit (at least for the initiated who know the new macro)

											
										
										
											2018-03-22 16:53:26 +01:00
+								        *ret = TAKE_PTR(our_env);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
 								        return 0;
 								}
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--set-env=...` options to systemd-spawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								static int build_pass_environment(const ExecContext *c, char ***ret) {
 								        _cleanup_strv_free_ char **pass_env = NULL;
 								        size_t n_env = 0, n_bufsize = 0;
 								        char **i;
 								        STRV_FOREACH(i, c->pass_environment) {
 								                _cleanup_free_ char *x = NULL;
 								                char *v;
 								                v = getenv(*i);
 								                if (!v)
 								                        continue;
-												tree-wide: drop NULL sentinel from strjoin

This makes strjoin and strjoina more similar and avoids the useless final
argument.

spatch -I . -I ./src -I ./src/basic -I ./src/basic -I ./src/shared -I ./src/shared -I ./src/network -I ./src/locale -I ./src/login -I ./src/journal -I ./src/journal -I ./src/timedate -I ./src/timesync -I ./src/nspawn -I ./src/resolve -I ./src/resolve -I ./src/systemd -I ./src/core -I ./src/core -I ./src/libudev -I ./src/udev -I ./src/udev/net -I ./src/udev -I ./src/libsystemd/sd-bus -I ./src/libsystemd/sd-event -I ./src/libsystemd/sd-login -I ./src/libsystemd/sd-netlink -I ./src/libsystemd/sd-network -I ./src/libsystemd/sd-hwdb -I ./src/libsystemd/sd-device -I ./src/libsystemd/sd-id128 -I ./src/libsystemd-network --sp-file coccinelle/strjoin.cocci --in-place $(git ls-files src/*.c)

git grep -e '\bstrjoin\b.*NULL' -l|xargs sed -i -r 's/strjoin\((.*), NULL\)/strjoin(\1)/'

This might have missed a few cases (spatch has a really hard time dealing
with _cleanup_ macros), but that's no big issue, they can always be fixed
later.

											
										
										
											2016-10-23 17:43:27 +02:00
+								                x = strjoin(*i, "=", v);
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--setenv=...` options to systemd-nspawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								                if (!x)
 								                        return -ENOMEM;
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--setenv=...` options to systemd-nspawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								                if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
 								                        return -ENOMEM;
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
-												tree-wide: use TAKE_PTR() and TAKE_FD() macros

											
										
										
											2018-04-05 07:26:26 +02:00
+								                pass_env[n_env++] = TAKE_PTR(x);
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--setenv=...` options to systemd-nspawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								                pass_env[n_env] = NULL;
 								        }
-												macro: introduce TAKE_PTR() macro

This macro will read a pointer of any type, return it, and set the
pointer to NULL. This is useful as an explicit concept of passing
ownership of a memory area between pointers.

This takes inspiration from Rust:

https://doc.rust-lang.org/std/option/enum.Option.html#method.take

and was suggested by Alan Jenkins (@sourcejedi).

It drops ~160 lines of code from our codebase, which makes me like it.
Also, I think it clarifies passing of ownership, and thus helps
readability a bit (at least for the initiated who know the new macro)

											
										
										
											2018-03-22 16:53:26 +01:00
+								        *ret = TAKE_PTR(pass_env);
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--setenv=...` options to systemd-nspawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
 								        return 0;
 								}
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								static bool exec_needs_mount_namespace(
 								                const ExecContext *context,
 								                const ExecParameters *params,
-												execute: make "runtime" argument const in exec_needs_mount_namespace()

The argument can be const, then let's make so.

											
										
										
											2017-12-30 10:38:26 +01:00
+								                const ExecRuntime *runtime) {
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
 								        assert(context);
 								        assert(params);
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        if (context->root_image)
 								                return true;
-												doc,core: Read{Write,Only}Paths= and InaccessiblePaths=

This patch renames Read{Write,Only}Directories= and InaccessibleDirectories=
to Read{Write,Only}Paths= and InaccessiblePaths=, previous names are kept
as aliases but they are not advertised in the documentation.

Renamed variables:
`read_write_dirs` --> `read_write_paths`
`read_only_dirs` --> `read_only_paths`
`inaccessible_dirs` --> `inaccessible_paths`

											
										
										
											2016-07-07 11:17:00 +02:00
+								        if (!strv_isempty(context->read_write_paths) ||
 								            !strv_isempty(context->read_only_paths) ||
 								            !strv_isempty(context->inaccessible_paths))
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								                return true;
-												Revert "core/execute: RuntimeDirectory= or friends requires mount namespace"

This reverts commit 652bb2637aee54e3503a22d2928a929ecd7a84b3.

Fixes #7761.

											
										
										
											2018-01-03 16:16:53 +01:00
+								        if (context->n_bind_mounts > 0)
-												core: add ability to define arbitrary bind mounts for services

This adds two new settings BindPaths= and BindReadOnlyPaths=. They allow
defining arbitrary bind mounts specific to particular services. This is
particularly useful for services with RootDirectory= set as this permits making
specific bits of the host directory available to chrooted services.

The two new settings follow the concepts nspawn already possess in --bind= and
--bind-ro=, as well as the .nspawn settings Bind= and BindReadOnly= (and these
latter options should probably be renamed to BindPaths= and BindReadOnlyPaths=
too).

Fixes: #3439

											
										
										
											2016-11-23 22:21:40 +01:00
+								                return true;
-												core: add new setting TemporaryFileSystem=

This introduces a new setting TemporaryFileSystem=. This is useful
to hide files not relevant to the processes invoked by unit, while
necessary files or directories can be still accessed by combining
with Bind{,ReadOnly}Paths=.

											
										
										
											2018-02-21 01:17:52 +01:00
+								        if (context->n_temporary_filesystems > 0)
 								                return true;
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								        if (context->mount_flags != 0)
 								                return true;
 								        if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
 								                return true;
 								        if (context->private_devices ||
-												core: add new PrivateMounts= unit setting

This new setting is supposed to be useful in most cases where
"MountFlags=slave" is currently used, i.e. as an explicit way to run a
service in its own mount namespace and decouple propagation from all
mounts of the new mount namespace towards the host.

The effect of MountFlags=slave and PrivateMounts=yes is mostly the same,
as both cause a CLONE_NEWNS namespace to be opened, and both will result
in all mounts within it to be mounted MS_SLAVE. The difference is mostly
on the conceptual/philosophical level: configuring the propagation mode
is nothing people should have to think about, in particular as the
matter is not precisely easy to grok. Moreover, MountFlags= allows configuration
of "private" and "slave" modes which don't really make much sense to use
in real-life and are quite confusing. In particular MountFlags=private means
mounts made on the host stay pinned for good by the service which is
particularly nasty for removable media mounts. And MountFlags=shared
is in most ways a NOP when used alone...

The main technical difference between setting only MountFlags=slave or
only PrivateMounts=yes in a unit file is that the former remounts all
mounts to MS_SLAVE and leaves them there, while that latter remounts
them to MS_SHARED again right after. The latter is generally a nicer
approach, since it disables propagation, while MS_SHARED is afterwards
in effect, which is really nice as that means further namespacing down
the tree will get MS_SHARED logic by default and we unify how
applications see our mounts as we always pass them as MS_SHARED
regardless whether any mount namespacing is used or not.

The effect of PrivateMounts=yes was implied already by all the other
mount namespacing options. With this new option we add an explicit knob
for it, to request it without any other option used as well.

See: #4393

											
										
										
											2018-06-01 11:10:49 +02:00
+								            context->private_mounts ||
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								            context->protect_system != PROTECT_SYSTEM_NO ||
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								            context->protect_home != PROTECT_HOME_NO ||
 								            context->protect_kernel_tunables ||
-												core:sandbox: lets make /lib/modules/ inaccessible on ProtectKernelModules=

Let's go further and make /lib/modules/ inaccessible for services that
have no business with modules. This is a minor improvement, but it may
help on setups with custom modules and they are limited... with regard to
the kernel auto-load feature.

This change introduces the NameSpaceInfo struct, which we may embed later
inside ExecContext, but for now let's just reduce the number of arguments
to setup_namespace() and merge the ProtectKernelModules feature.

											
										
										
											2016-10-12 14:11:16 +02:00
+								            context->protect_kernel_modules ||
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								            context->protect_control_groups)
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								                return true;
-												core: setup mount namespace when RootDirectory= and RuntimeDirectory= or friends are set

The directories specified by RuntimeDirectory= or friends are created
on host. So, it is necessary to bind-mount them on root directory.

											
										
										
											2018-05-25 10:32:55 +02:00
+								        if (context->root_directory) {
 								                ExecDirectoryType t;
 								                if (context->mount_apivfs)
 								                        return true;
 								                for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
 								                        if (!params->prefix[t])
 								                                continue;
 								                        if (!strv_isempty(context->directories[t].paths))
 								                                return true;
 								                }
 								        }
-												core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjunction with RootDirectory=

This adds a boolean unit file setting MountAPIVFS=. If set, the three
main API VFS mounts will be mounted for the service. This only has an
effect on RootDirectory=, which it makes a ton times more useful.

(This is basically the /dev + /proc + /sys mounting code posted in the
original #4727, but rebased on current git, and with the automatic logic
replaced by explicit logic controlled by a unit file setting)

											
										
										
											2016-12-22 23:34:35 +01:00
-												Revert "core/execute: RuntimeDirectory= or friends requires mount namespace"

This reverts commit 652bb2637aee54e3503a22d2928a929ecd7a84b3.

Fixes #7761.

											
										
										
											2018-01-03 16:16:53 +01:00
+								        if (context->dynamic_user &&
-												core: RuntimeDirectory= does not request new mount namespace

Now RuntimeDirectory= does not create a 'private' directory.
Thus, it is not necessary to request a new mount namespace.

Follow-up for 8092a48cc1d1fb20b66371576754df831d30a43b.

											
										
										
											2018-01-03 16:20:44 +01:00
+								            (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
-												Revert "core/execute: RuntimeDirectory= or friends requires mount namespace"

This reverts commit 652bb2637aee54e3503a22d2928a929ecd7a84b3.

Fixes #7761.

											
										
										
											2018-01-03 16:16:53 +01:00
+								             !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
 								             !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
 								                return true;
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								        return false;
 								}
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								static int setup_private_users(uid_t uid, gid_t gid) {
 								        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
 								        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
 								        _cleanup_close_ int unshare_ready_fd = -1;
 								        _cleanup_(sigkill_waitp) pid_t pid = 0;
 								        uint64_t c = 1;
 								        ssize_t n;
 								        int r;
 								        /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
 								         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
 								         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
 								         * which waits for the parent to create the new user namespace while staying in the original namespace. The
 								         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
 								         * continues execution normally. */
-												core/execute.c: check asprintf return value in the usual fashion

This is unlikely to fail, but we cannot rely on asprintf return value
on failure, so let's just be correct here.

CID #1368227.

											
										
										
											2017-01-31 17:31:47 +01:00
+								        if (uid != 0 && uid_is_valid(uid)) {
 								                r = asprintf(&uid_map,
 								                             "0 0 1\n"                      /* Map root → root */
 								                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
 								                             uid, uid);
 								                if (r < 0)
 								                        return -ENOMEM;
 								        } else {
-												core: move misplaced comment to the right place

											
										
										
											2016-10-21 20:05:49 +02:00
+								                uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
-												core/execute.c: check asprintf return value in the usual fashion

This is unlikely to fail, but we cannot rely on asprintf return value
on failure, so let's just be correct here.

CID #1368227.

											
										
										
											2017-01-31 17:31:47 +01:00
+								                if (!uid_map)
 								                        return -ENOMEM;
 								        }
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
-												core/execute.c: check asprintf return value in the usual fashion

This is unlikely to fail, but we cannot rely on asprintf return value
on failure, so let's just be correct here.

CID #1368227.

											
										
										
											2017-01-31 17:31:47 +01:00
+								        if (gid != 0 && gid_is_valid(gid)) {
 								                r = asprintf(&gid_map,
 								                             "0 0 1\n"                      /* Map root → root */
 								                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
 								                             gid, gid);
 								                if (r < 0)
 								                        return -ENOMEM;
 								        } else {
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								                gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
-												core/execute.c: check asprintf return value in the usual fashion

This is unlikely to fail, but we cannot rely on asprintf return value
on failure, so let's just be correct here.

CID #1368227.

											
										
										
											2017-01-31 17:31:47 +01:00
+								                if (!gid_map)
 								                        return -ENOMEM;
 								        }
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
 								        /* Create a communication channel so that the parent can tell the child when it finished creating the user
 								         * namespace. */
 								        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
 								        if (unshare_ready_fd < 0)
 								                return -errno;
 								        /* Create a communication channel so that the child can tell the parent a proper error code in case it
 								         * failed. */
 								        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
 								                return -errno;
-												tree-wide: introduce new safe_fork() helper and port everything over

This adds a new safe_fork() wrapper around fork() and makes use of it
everywhere. The new wrapper does a couple of things we previously did
manually and separately in a safer, more correct and automatic way:

1. Optionally resets signal handlers/mask in the child

2. Sets a name on all processes we fork off right after forking off (and
   the patch assigns useful names for all processes we fork off now,
   following a systematic naming scheme: always enclosed in () – in order
   to indicate that these are not proper, exec()ed processes, but only
   forked off children, and if the process is long-running with only our
   own code, without execve()'ing something else, it gets an "sd-" prefix.)

3. Optionally closes all file descriptors in the child

4. Optionally sets a PR_SET_DEATHSIG to SIGTERM in the child, in a safe
   way so that the parent dying before this happens is handled
   safely.

5. Optionally reopens the logs

6. Optionally connects stdin/stdout/stderr to /dev/null

7. Debug logs about the forked off processes.

											
										
										
											2017-12-22 13:08:14 +01:00
+								        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
 								        if (r < 0)
 								                return r;
 								        if (r == 0) {
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								                _cleanup_close_ int fd = -1;
 								                const char *a;
 								                pid_t ppid;
 								                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
 								                 * here, after the parent opened its own user namespace. */
 								                ppid = getppid();
 								                errno_pipe[0] = safe_close(errno_pipe[0]);
 								                /* Wait until the parent unshared the user namespace */
 								                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
 								                        r = -errno;
 								                        goto child_fail;
 								                }
 								                /* Disable the setgroups() system call in the child user namespace, for good. */
 								                a = procfs_file_alloca(ppid, "setgroups");
 								                fd = open(a, O_WRONLY|O_CLOEXEC);
 								                if (fd < 0) {
 								                        if (errno != ENOENT) {
 								                                r = -errno;
 								                                goto child_fail;
 								                        }
 								                        /* If the file is missing the kernel is too old, let's continue anyway. */
 								                } else {
 								                        if (write(fd, "deny\n", 5) < 0) {
 								                                r = -errno;
 								                                goto child_fail;
 								                        }
 								                        fd = safe_close(fd);
 								                }
 								                /* First write the GID map */
 								                a = procfs_file_alloca(ppid, "gid_map");
 								                fd = open(a, O_WRONLY|O_CLOEXEC);
 								                if (fd < 0) {
 								                        r = -errno;
 								                        goto child_fail;
 								                }
 								                if (write(fd, gid_map, strlen(gid_map)) < 0) {
 								                        r = -errno;
 								                        goto child_fail;
 								                }
 								                fd = safe_close(fd);
 								                /* The write the UID map */
 								                a = procfs_file_alloca(ppid, "uid_map");
 								                fd = open(a, O_WRONLY|O_CLOEXEC);
 								                if (fd < 0) {
 								                        r = -errno;
 								                        goto child_fail;
 								                }
 								                if (write(fd, uid_map, strlen(uid_map)) < 0) {
 								                        r = -errno;
 								                        goto child_fail;
 								                }
 								                _exit(EXIT_SUCCESS);
 								        child_fail:
 								                (void) write(errno_pipe[1], &r, sizeof(r));
 								                _exit(EXIT_FAILURE);
 								        }
 								        errno_pipe[1] = safe_close(errno_pipe[1]);
 								        if (unshare(CLONE_NEWUSER) < 0)
 								                return -errno;
 								        /* Let the child know that the namespace is ready now */
 								        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
 								                return -errno;
 								        /* Try to read an error code from the child */
 								        n = read(errno_pipe[0], &r, sizeof(r));
 								        if (n < 0)
 								                return -errno;
 								        if (n == sizeof(r)) { /* an error code was sent to us */
 								                if (r < 0)
 								                        return r;
 								                return -EIO;
 								        }
 								        if (n != 0) /* on success we should have read 0 bytes */
 								                return -EIO;
-												tree-wide: make use of wait_for_terminate_and_check() at various places

Using wait_for_terminate_and_check() instead of wait_for_terminate()
lets us simplify, shorten and unify the return value checking and
logging of waitid().  Hence, let's use it all over the place.

											
										
										
											2017-12-29 18:07:00 +01:00
+								        r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
 								        pid = 0;
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								        if (r < 0)
 								                return r;
-												tree-wide: make use of wait_for_terminate_and_check() at various places

Using wait_for_terminate_and_check() instead of wait_for_terminate()
lets us simplify, shorten and unify the return value checking and
logging of waitid().  Hence, let's use it all over the place.

											
										
										
											2017-12-29 18:07:00 +01:00
+								        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								                return -EIO;
 								        return 0;
 								}
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								static int setup_exec_directory(
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
+								                const ExecContext *context,
 								                const ExecParameters *params,
 								                uid_t uid,
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                gid_t gid,
 								                ExecDirectoryType type,
 								                int *exit_status) {
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
 								                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
 								                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
 								                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
 								                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
 								        };
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
+								        char **rt;
 								        int r;
 								        assert(context);
 								        assert(params);
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        assert(exit_status);
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        if (!params->prefix[type])
 								                return 0;
-												execute: add one more ExecFlags flag, for controlling unconditional directory chowning

Let's decouple the Manager object from the execution logic a bit more
here too, and simply pass along the fact whether we should
unconditionally chown the runtime/... directories via the ExecFlags
field too.

											
										
										
											2017-08-01 10:35:10 +02:00
+								        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                if (!uid_is_valid(uid))
 								                        uid = 0;
 								                if (!gid_is_valid(gid))
 								                        gid = 0;
 								        }
 								        STRV_FOREACH(rt, context->directories[type].paths) {
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                _cleanup_free_ char *p = NULL, *pp = NULL;
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                p = strjoin(params->prefix[type], "/", *rt);
 								                if (!p) {
 								                        r = -ENOMEM;
 								                        goto fail;
 								                }
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
-												core: support subdirectories in RuntimeDirectory= option

											
										
										
											2017-07-17 09:30:53 +02:00
+								                r = mkdir_parents_label(p, 0755);
 								                if (r < 0)
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                        goto fail;
-												core: support subdirectories in RuntimeDirectory= option

											
										
										
											2017-07-17 09:30:53 +02:00
-												core/execute: do not create RuntimeDirectory= under private/ sub-directory

RuntimeDirectory= often used for sharing files or sockets with other
services. So, if creating them under private/ sub-directory, we cannot
set DynamicUser= to service units which want to share something through
RuntimeDirectory=.
This makes the directories given by RuntimeDirectory= are created under
/run/ even if DynamicUser= is set.

Fixes #7260.

											
										
										
											2017-11-08 07:50:58 +01:00
+								                if (context->dynamic_user &&
 								                    !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                        _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
 								                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
 								                         * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
 								                         * whose UID is later on reused. To lock this down we use the same trick used by container
 								                         * managers to prohibit host users to get access to files of the same UID in containers: we
 								                         * place everything inside a directory that has an access mode of 0700 and is owned root:root,
 								                         * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
 								                         * to make this directory permeable for the service itself.
 								                         *
 								                         * Specifically: for a service which wants a special directory "foo/" we first create a
 								                         * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
 								                         * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
 								                         * privileged host users can access "foo/" as usual, but unprivileged host users can't look
 								                         * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
 								                         * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
 								                         * disabling the access boundary for the service and making sure it only gets access to the
 								                         * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
 								                         *
 								                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
-												core/execute: do not create RuntimeDirectory= under private/ sub-directory

RuntimeDirectory= often used for sharing files or sockets with other
services. So, if creating them under private/ sub-directory, we cannot
set DynamicUser= to service units which want to share something through
RuntimeDirectory=.
This makes the directories given by RuntimeDirectory= are created under
/run/ even if DynamicUser= is set.

Fixes #7260.

											
										
										
											2017-11-08 07:50:58 +01:00
+								                         * owned by the service itself.
 								                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
 								                         * files or sockets with other services. */
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
 								                        private_root = strjoin(params->prefix[type], "/private");
 								                        if (!private_root) {
 								                                r = -ENOMEM;
 								                                goto fail;
 								                        }
 								                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
-												tree-wide: warn when a directory path already exists but has bad mode/owner/type

When we are attempting to create directory somewhere in the bowels of /var/lib
and get an error that it already exists, it can be quite hard to diagnose what
is wrong (especially for a user who is not aware that the directory must have
the specified owner, and permissions not looser than what was requested). Let's
print a warning in most cases. A warning is appropriate, because such state is
usually a sign of a borked installation and needs to be resolved by the administrator.

$ build/test-fs-util

Path "/tmp/test-readlink_and_make_absolute" already exists and is not a directory, refusing.
   (or)
Directory "/tmp/test-readlink_and_make_absolute" already exists, but has mode 0775 that is too permissive (0755 was requested), refusing.
   (or)
Directory "/tmp/test-readlink_and_make_absolute" already exists, but is owned by 1001:1000 (1000:1000 was requested), refusing.

Assertion 'mkdir_safe(tempdir, 0755, getuid(), getgid(), MKDIR_WARN_MODE) >= 0' failed at ../src/test/test-fs-util.c:320, function test_readlink_and_make_absolute(). Aborting.

No functional change except for the new log lines.

											
										
										
											2018-03-22 13:03:41 +01:00
+								                        r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                        if (r < 0)
 								                                goto fail;
 								                        pp = strjoin(private_root, "/", *rt);
 								                        if (!pp) {
 								                                r = -ENOMEM;
 								                                goto fail;
 								                        }
 								                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
 								                        r = mkdir_parents_label(pp, 0755);
 								                        if (r < 0)
 								                                goto fail;
-												core: support upgrading from DynamicUser=0 to DynamicUser=1 for unit directories (#7507)

This makes sure we migrate /var/lib/<foo> if it exists to
/var/lib/private/<foo> if DynamicUser=1 is set. This is useful to allow
turning on DynamicUser= on services that previously didn't use it, and
we can deal with this, and migrate the relevant directories as
necessary.

Note that "downgrading" from DynamicUser=1 back to DynamicUser=0 works
too. However in that case we simply continue to use
/var/lib/private/<foo>, which works because /var/lib/<foo> is a symlink
there after all.
											
										
										
											2017-11-30 11:52:39 +01:00
+								                        if (is_dir(p, false) > 0 &&
 								                            (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
 								                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
 								                                 * it over. Most likely the service has been upgraded from one that didn't use
 								                                 * DynamicUser=1, to one that does. */
 								                                if (rename(p, pp) < 0) {
 								                                        r = -errno;
 								                                        goto fail;
 								                                }
 								                        } else {
 								                                /* Otherwise, create the actual directory for the service */
 								                                r = mkdir_label(pp, context->directories[type].mode);
 								                                if (r < 0 && r != -EEXIST)
 								                                        goto fail;
 								                        }
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in. This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
 								                        parent = dirname_malloc(p);
 								                        if (!parent) {
 								                                r = -ENOMEM;
 								                                goto fail;
 								                        }
 								                        r = path_make_relative(parent, pp, &relative);
 								                        if (r < 0)
 								                                goto fail;
 								                        /* And link it up from the original place */
 								                        r = symlink_idempotent(relative, p);
 								                        if (r < 0)
 								                                goto fail;
-												pid1: when creating service directories, don't chown existing files (#8181)

This partially reverts 3536f49e8fa281539798a7bc5004d73302f39673 and
3536f49e8fa281539798a7bc5004d73302f39673.

When the user is dynamic, and we are setting up state, cache, or logs dirs,
behaviour is unchanged, we always do a recursive chown. This is necessary
because the user number might change between invocations.

But when setting up a directory for non-dynamic user, or a runtime directory
for a dynamic user, do any ownership or mode changes only when the directory
is initially created. Nothing says that the files under those directories have
to be all recursively owned by our user. This restores behaviour before
3536f49e8fa281539798a7bc5004d73302f39673, so modifications to the state of
the runtime directory persist between ExecStartPre's and ExecStart's, and even
longer in case the directory is persistent.

I think it _would_ be a nice property if setting a user would automatically
propagate to ownership of any Runtime/Logs/Cache directories. But this is
incompatible with another nice property, namely preserving changes to those
directories made by an admin, and with allowing change of ownership of files
in those directories by the service (e.g. to allow other users to access them).
Of the two, I think the second property is more important. Also, it's backwards
compatible.

https://bugzilla.redhat.com/show_bug.cgi?id=1508495

There is no need to chmod a directory we just created, so move that step
up into a branch. After that, 'effective' is only used once, so get rid of
it too.
											
										
										
											2018-02-22 11:30:59 +01:00
+								                        /* Lock down the access mode */
 								                        if (chmod(pp, context->directories[type].mode) < 0) {
 								                                r = -errno;
 								                                goto fail;
 								                        }
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in. This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                } else {
 								                        r = mkdir_label(p, context->directories[type].mode);
-												core: chown RuntimeDirectory= if DynamicUser= is set

When DynamicUser= is set, then RuntimeDirectory= should be always
chowned, as the service unit may enable RuntimeDirectoryPreserve=,
and the uid or gid may have changed since the last run.
This also makes it easier to migrate the service to use DynamicUser=.

											
										
										
											2018-05-22 15:26:22 +02:00
+								                        if (r < 0 && r != -EEXIST)
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in. This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                                goto fail;
-												core: chown RuntimeDirectory= if DynamicUser= is set

When DynamicUser= is set, then RuntimeDirectory= should be always
chowned, as the service unit may enable RuntimeDirectoryPreserve=,
and the uid or gid may have changed since the last run.
This also makes it easier to migrate the service to use DynamicUser=.

											
										
										
											2018-05-22 15:26:22 +02:00
+								                        if (r == -EEXIST && !context->dynamic_user)
 								                                continue;
-												core: chown() StateDirectory= and friends recursively when starting a service

This is particularly useful when used in conjunction with DynamicUser=1,
where the UID might change for every invocation, but is useful in other
cases too, for example, when these directories are shared between
systems where the UID assignments differ slightly.

											
										
										
											2017-09-28 19:13:44 +02:00
+								                }
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
-												core: don't chown() the configuration directory

The configuration directory is commonly not owned by a service, but
remains root-owned, hence don't change the owner automatically for it.

											
										
										
											2017-08-01 10:36:33 +02:00
+								                /* Don't change the owner of the configuration directory, as in the common case it is not written to by
 								                 * a service, and shall not be writable. */
 								                if (type == EXEC_DIRECTORY_CONFIGURATION)
 								                        continue;
-												core: chown() StateDirectory= and friends recursively when starting a service

This is particularly useful when used in conjunction with DynamicUser=1,
where the UID might change for every invocation, but is useful in other
cases too, for example, when these directories are shared between
systems where the UID assignments differ slightly.

											
										
										
											2017-09-28 19:13:44 +02:00
+								                /* Then, change the ownership of the whole tree, if necessary */
-												pid1: when creating service directories, don't chown existing files (#8181)

This partially reverts 3536f49e8fa281539798a7bc5004d73302f39673 and
3536f49e8fa281539798a7bc5004d73302f39673.

When the user is dynamic, and we are setting up state, cache, or logs dirs,
behaviour is unchanged, we always do a recursive chown. This is necessary
because the user number might change between invocations.

But when setting up a directory for non-dynamic user, or a runtime directory
for a dynamic user, do any ownership or mode changes only when the directory
is initially created. Nothing says that the files under those directories have
to be all recursively owned by our user. This restores behaviour before
3536f49e8fa281539798a7bc5004d73302f39673, so modifications to the state of
the runtime directory persist between ExecStartPre's and ExecStart's, and even
longer in case the directory is persistent.

I think it _would_ be a nice property if setting a user would automatically
propagate to ownership of any Runtime/Logs/Cache directories. But this is
incompatible with another nice property, namely preserving changes to those
directories made by an admin, and with allowing change of ownership of files
in those directories by the service (e.g. to allow other users to access them).
Of the two, I think the second property is more important. Also, it's backwards
compatible.

https://bugzilla.redhat.com/show_bug.cgi?id=1508495

There is no need to chmod a directory we just created, so move that step
up into a branch. After that, 'effective' is only used once, so get rid of
it too.
											
										
										
											2018-02-22 11:30:59 +01:00
+								                r = path_chown_recursive(pp ?: p, uid, gid);
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
+								                if (r < 0)
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                        goto fail;
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
+								        }
 								        return 0;
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
 								fail:
 								        *exit_status = exit_status_table[type];
 								        return r;
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
+								}
-												execute: define setup_smack() only if SMACK is enabled

This suppresses the following warning
```
execute.c:2149:12: warning: ‘setup_smack’ defined but not used [-Wunused-function]
 static int setup_smack(
            ^~~~~~~~~~~
```

											
										
										
											2017-12-05 06:04:12 +01:00
+								#if ENABLE_SMACK
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
+								static int setup_smack(
 								                const ExecContext *context,
 								                const ExecCommand *command) {
 								        /* Applies a SMACK process label via mac_smack_apply_pid(). The label configured in
 								         * context->smack_process_label is used when set. Otherwise, and only when the build defines
 								         * SMACK_DEFAULT_PROCESS_LABEL, the SMACK_ATTR_EXEC label read from command->path is used, falling
 								         * back to SMACK_DEFAULT_PROCESS_LABEL when no such label was obtained.
 								         *
 								         * Returns 0 on success, or the negative error code returned by the failing helper.
 								         *
 								         * NOTE(review): pid 0 presumably designates the calling process; confirm against
 								         * mac_smack_apply_pid(). */
 								        int r;
 								        assert(context);
 								        assert(command);
 								        /* An explicitly configured label takes precedence over anything derived from the binary. */
 								        if (context->smack_process_label) {
 								                r = mac_smack_apply_pid(0, context->smack_process_label);
 								                if (r < 0)
 								                        return r;
 								        }
 								#ifdef SMACK_DEFAULT_PROCESS_LABEL
 								        else {
 								                _cleanup_free_ char *exec_label = NULL;
 								                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
 								                /* -ENODATA and -EOPNOTSUPP are tolerated below; any other read error is returned to the
 								                 * caller. In the tolerated cases exec_label presumably stays NULL, so the default label is
 								                 * applied (TODO confirm against mac_smack_read()). */
-												tree-wide: use IN_SET macro (#6977)


											
										
										
											2017-10-04 16:01:32 +02:00
+								                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
+								                        return r;
 								                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
 								                if (r < 0)
 								                        return r;
 								        }
 								#endif
 								        return 0;
 								}
-												execute: define setup_smack() only if SMACK is enabled

This suppresses the following warning
```
execute.c:2149:12: warning: ‘setup_smack’ defined but not used [-Wunused-function]
 static int setup_smack(
            ^~~~~~~~~~~
```

											
										
										
											2017-12-05 06:04:12 +01:00
+								#endif
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in. This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								static int compile_bind_mounts(
 								                const ExecContext *context,
 								                const ExecParameters *params,
 								                BindMount **ret_bind_mounts,
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								                size_t *ret_n_bind_mounts,
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                char ***ret_empty_directories) {
 								        _cleanup_strv_free_ char **empty_directories = NULL;
 								        BindMount *bind_mounts;
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								        size_t n, h = 0, i;
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								        ExecDirectoryType t;
 								        int r;
 								        assert(context);
 								        assert(params);
 								        assert(ret_bind_mounts);
 								        assert(ret_n_bind_mounts);
 								        assert(ret_empty_directories);
 								        n = context->n_bind_mounts;
 								        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
 								                if (!params->prefix[t])
 								                        continue;
 								                n += strv_length(context->directories[t].paths);
 								        }
 								        if (n <= 0) {
 								                *ret_bind_mounts = NULL;
 								                *ret_n_bind_mounts = 0;
 								                *ret_empty_directories = NULL;
 								                return 0;
 								        }
 								        bind_mounts = new(BindMount, n);
 								        if (!bind_mounts)
 								                return -ENOMEM;
-												core: fix segfault in compile_bind_mounts() when BindPaths= or BindReadOnlyPaths= is set

This fixes a bug introduced by 6c47cd7d3bf35c8158a0737f34fe2c5dc95e72d6.

Fixes #7055.

											
										
										
											2017-10-11 05:27:13 +02:00
+								        for (i = 0; i < context->n_bind_mounts; i++) {
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                BindMount *item = context->bind_mounts + i;
 								                char *s, *d;
 								                s = strdup(item->source);
 								                if (!s) {
 								                        r = -ENOMEM;
 								                        goto finish;
 								                }
 								                d = strdup(item->destination);
 								                if (!d) {
 								                        free(s);
 								                        r = -ENOMEM;
 								                        goto finish;
 								                }
 								                bind_mounts[h++] = (BindMount) {
 								                        .source = s,
 								                        .destination = d,
 								                        .read_only = item->read_only,
 								                        .recursive = item->recursive,
 								                        .ignore_enoent = item->ignore_enoent,
 								                };
 								        }
 								        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
 								                char **suffix;
 								                if (!params->prefix[t])
 								                        continue;
 								                if (strv_isempty(context->directories[t].paths))
 								                        continue;
-												core/execute: do not create RuntimeDirectory= under private/ sub-directory

RuntimeDirectory= is often used for sharing files or sockets with other
services. So, if we create these directories under the private/
sub-directory, we cannot set DynamicUser= on service units which want to
share something through RuntimeDirectory=.
This makes sure the directories given by RuntimeDirectory= are created
under /run/ even if DynamicUser= is set.

Fixes #7260.

											
										
										
											2017-11-08 07:50:58 +01:00
+								                if (context->dynamic_user &&
-												core: make StateDirectory= or friends works with DynamicUser= and RootDirectory=/RootImage=

The symbolic links to private directories specified by StateDirectory=
or its friends are created on the host. So, when DynamicUser= and
RootDirectory=/RootImage= are set, the executed process cannot
access the private directory.
This makes sure the private directories are mounted on the non-private
place when both DynamicUser= and RootDirectory=/RootImage= are set.

Fixes #8965.

											
										
										
											2018-05-25 10:25:17 +02:00
+								                    !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
 								                    !(context->root_directory || context->root_image)) {
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                        char *private_root;
 								                        /* So this is for a dynamic user, and we need to make sure the process can access its own
 								                         * directory. For that we overmount the usually inaccessible "private" subdirectory with a
 								                         * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
 								                        private_root = strjoin(params->prefix[t], "/private");
 								                        if (!private_root) {
 								                                r = -ENOMEM;
 								                                goto finish;
 								                        }
 								                        r = strv_consume(&empty_directories, private_root);
-												core/execute: simplify compile_bind_mounts()

It is not necessary to re-assign error code.

											
										
										
											2018-02-12 08:16:58 +01:00
+								                        if (r < 0)
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                                goto finish;
 								                }
 								                STRV_FOREACH(suffix, context->directories[t].paths) {
 								                        char *s, *d;
-												core/execute: do not create RuntimeDirectory= under private/ sub-directory

RuntimeDirectory= is often used for sharing files or sockets with other
services. So, if we create these directories under the private/
sub-directory, we cannot set DynamicUser= on service units which want to
share something through RuntimeDirectory=.
This makes sure the directories given by RuntimeDirectory= are created
under /run/ even if DynamicUser= is set.

Fixes #7260.

											
										
										
											2017-11-08 07:50:58 +01:00
+								                        if (context->dynamic_user &&
 								                            !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                                s = strjoin(params->prefix[t], "/private/", *suffix);
 								                        else
 								                                s = strjoin(params->prefix[t], "/", *suffix);
 								                        if (!s) {
 								                                r = -ENOMEM;
 								                                goto finish;
 								                        }
-												core: make StateDirectory= or friends works with DynamicUser= and RootDirectory=/RootImage=

The symbolic links to private directories specified by StateDirectory=
or its friends are created on the host. So, when DynamicUser= and
RootDirectory=/RootImage= are set, the executed process cannot
access the private directory.
This makes sure the private directories are mounted on the non-private
place when both DynamicUser= and RootDirectory=/RootImage= are set.

Fixes #8965.

											
										
										
											2018-05-25 10:25:17 +02:00
+								                        if (context->dynamic_user &&
 								                            !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
 								                            (context->root_directory || context->root_image))
 								                                /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
 								                                 * directory is not created on the root directory. So, let's bind-mount the directory
 								                                 * on the 'non-private' place. */
 								                                d = strjoin(params->prefix[t], "/", *suffix);
 								                        else
 								                                d = strdup(s);
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                        if (!d) {
 								                                free(s);
 								                                r = -ENOMEM;
 								                                goto finish;
 								                        }
 								                        bind_mounts[h++] = (BindMount) {
 								                                .source = s,
 								                                .destination = d,
 								                                .read_only = false,
 								                                .recursive = true,
 								                                .ignore_enoent = false,
 								                        };
 								                }
 								        }
 								        assert(h == n);
 								        *ret_bind_mounts = bind_mounts;
 								        *ret_n_bind_mounts = n;
-												macro: introduce TAKE_PTR() macro

This macro will read a pointer of any type, return it, and set the
pointer to NULL. This is useful as an explicit concept of passing
ownership of a memory area between pointers.

This takes inspiration from Rust:

https://doc.rust-lang.org/std/option/enum.Option.html#method.take

and was suggested by Alan Jenkins (@sourcejedi).

It drops ~160 lines of code from our codebase, which makes me like it.
Also, I think it clarifies passing of ownership, and thus helps
readability a bit (at least for the initiated who know the new macro)

											
										
										
											2018-03-22 16:53:26 +01:00
+								        *ret_empty_directories = TAKE_PTR(empty_directories);
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
 								        return (int) n;
 								finish:
 								        bind_mount_free_many(bind_mounts, h);
 								        return r;
 								}
-												core: skip ReadOnlyPaths= and other permission-related mounts on PermissionsStartOnly= (#5309)

ReadOnlyPaths=, ProtectHome=, InaccessiblePaths= and ProtectSystem= are
about restricting access and little more, hence they should be disabled
if PermissionsStartOnly= is used or ExecStart= lines are prefixed with a
"+". Do that.

(Note that we will still create namespaces and stuff, since that's about
a lot more than just permissions. We'll simply disable the effect of
the four options mentioned above, but nothing else mount related.)

This also adds a test for this, to ensure this works as intended.

No documentation updates, as the documentation is already vague enough
to support the new behaviour ("If true, the permission-related execution
options…"). We could clarify this further, but I think we might want to
extend the switches' behaviour a bit more in future, hence leave it at
this for now.

Fixes: #5308
											
										
										
											2017-02-12 06:44:46 +01:00
+								static int apply_mount_namespace(
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								                const Unit *u,
 								                const ExecCommand *command,
-												core: skip ReadOnlyPaths= and other permission-related mounts on PermissionsStartOnly= (#5309)

ReadOnlyPaths=, ProtectHome=, InaccessiblePaths= and ProtectSystem= are
about restricting access and little more, hence they should be disabled
if PermissionsStartOnly= is used or ExecStart= lines are prefixed with a
"+". Do that.

(Note that we will still create namespaces and stuff, since that's about
a lot more than just permissions. We'll simply disable the effect of
the four options mentioned above, but nothing else mount related.)

This also adds a test for this, to ensure this works as intended.

No documentation updates, as the documentation is already vague enough
to support the new behaviour ("If true, the permission-related execution
options…"). We could clarify this further, but I think we might want to
extend the switches' behaviour a bit more in future, hence leave it at
this for now.

Fixes: #5308
											
										
										
											2017-02-12 06:44:46 +01:00
+								                const ExecContext *context,
 								                const ExecParameters *params,
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								                const ExecRuntime *runtime) {
-												core: skip ReadOnlyPaths= and other permission-related mounts on PermissionsStartOnly= (#5309)

ReadOnlyPaths=, ProtectHome=, InaccessiblePaths= and ProtectSystem= are
about restricting access and little more, hence they should be disabled
if PermissionsStartOnly= is used or ExecStart= lines are prefixed with a
"+". Do that.

(Note that we will still create namespaces and stuff, since that's about
a lot more than just permissions. We'll simply disable the effect of
the four options mentioned above, but nothing else mount related.)

This also adds a test for this, to ensure this works as intended.

No documentation updates, as the documentation is already vague enough
to support the new behaviour ("If true, the permission-related execution
options…"). We could clarify this further, but I think we might want to
extend the switches' behaviour a bit more in future, hence leave it at
this for now.

Fixes: #5308
											
										
										
											2017-02-12 06:44:46 +01:00
-												core: remove compile_read_write_paths()

Since 6c47cd7d3bf35c8158a0737f34fe2c5dc95e72d6, RuntimeDirectory= and
its friends also imply BindPaths=. Thus, implying ReadWritePaths=
is meaningless.

											
										
										
											2017-10-13 14:13:25 +02:00
+								        _cleanup_strv_free_ char **empty_directories = NULL;
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								        char *tmp = NULL, *var = NULL;
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        const char *root_dir = NULL, *root_image = NULL;
-												core: add new PrivateMounts= unit setting

This new setting is supposed to be useful in most cases where
"MountFlags=slave" is currently used, i.e. as an explicit way to run a
service in its own mount namespace and decouple propagation from all
mounts of the new mount namespace towards the host.

The effect of MountFlags=slave and PrivateMounts=yes is mostly the same,
as both cause a CLONE_NEWNS namespace to be opened, and both will result
in all mounts within it to be mounted MS_SLAVE. The difference is mostly
on the conceptual/philosophical level: configuring the propagation mode
is nothing people should have to think about, in particular as the
matter is not precisely easy to grok. Moreover, MountFlags= allows configuration
of "private" and "slave" modes which don't really make much sense to use
in real-life and are quite confusing. In particular MountFlags=private means
mounts made on the host stay pinned for good by the service which is
particularly nasty for removable media mounts. And MountFlags=shared
is in most ways a NOP when used alone...

The main technical difference between setting only MountFlags=slave or
only PrivateMounts=yes in a unit file is that the former remounts all
mounts to MS_SLAVE and leaves them there, while that latter remounts
them to MS_SHARED again right after. The latter is generally a nicer
approach, since it disables propagation, while MS_SHARED is afterwards
in effect, which is really nice as that means further namespacing down
the tree will get MS_SHARED logic by default and we unify how
applications see our mounts as we always pass them as MS_SHARED
regardless whether any mount namespacing is used or not.

The effect of PrivateMounts=yes was implied already by all the other
mount namespacing options. With this new option we add an explicit knob
for it, to request it without any other option used as well.

See: #4393

											
										
										
											2018-06-01 11:10:49 +02:00
+								        NamespaceInfo ns_info;
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        bool needs_sandboxing;
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								        BindMount *bind_mounts = NULL;
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								        size_t n_bind_mounts = 0;
-												core: skip ReadOnlyPaths= and other permission-related mounts on PermissionsStartOnly= (#5309)

ReadOnlyPaths=, ProtectHome=, InaccessiblePaths= and ProtectSystem= are
about restricting access and little more, hence they should be disabled
if PermissionsStartOnly= is used or ExecStart= lines are prefixed with a
"+". Do that.

(Note that we will still create namespaces and stuff, since that's about
a lot more than just permissions. We'll simply disable the effect of
the four options mentioned above, but nothing else mount related.)

This also adds a test for this, to ensure this works as intended.

No documentation updates, as the documentation is already vague enough
to support the new behaviour ("If true, the permission-related execution
options…"). We could clarify this further, but I think we might want to
extend the switches' behaviour a bit more in future, hence leave it at
this for now.

Fixes: #5308
											
										
										
											2017-02-12 06:44:46 +01:00
+								        int r;
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and let's get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								        assert(context);
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								        /* The runtime struct only contains the parent of the private /tmp,
 								         * which is non-accessible to world users. Inside of it there's a /tmp
 								         * that is sticky, and that's the one we want to use here. */
 								        if (context->private_tmp && runtime) {
 								                if (runtime->tmp_dir)
 								                        tmp = strjoina(runtime->tmp_dir, "/tmp");
 								                if (runtime->var_tmp_dir)
 								                        var = strjoina(runtime->var_tmp_dir, "/tmp");
 								        }
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        if (params->flags & EXEC_APPLY_CHROOT) {
 								                root_image = context->root_image;
 								                if (!root_image)
 								                        root_dir = context->root_directory;
 								        }
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								        r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
 								        if (r < 0)
 								                return r;
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-												core: disable namespace sandboxing for '+' prefixed lines

Fixes #8842.

											
										
										
											2018-05-01 06:33:34 +02:00
+								        if (needs_sandboxing)
 								                ns_info = (NamespaceInfo) {
 								                        .ignore_protect_paths = false,
 								                        .private_dev = context->private_devices,
 								                        .protect_control_groups = context->protect_control_groups,
 								                        .protect_kernel_tunables = context->protect_kernel_tunables,
 								                        .protect_kernel_modules = context->protect_kernel_modules,
 								                        .mount_apivfs = context->mount_apivfs,
-												core: add new PrivateMounts= unit setting

This new setting is supposed to be useful in most cases where
"MountFlags=slave" is currently used, i.e. as an explicit way to run a
service in its own mount namespace and decouple propagation from all
mounts of the new mount namespace towards the host.

The effect of MountFlags=slave and PrivateMounts=yes is mostly the same,
as both cause a CLONE_NEWNS namespace to be opened, and both will result
in all mounts within it to be mounted MS_SLAVE. The difference is mostly
on the conceptual/philosophical level: configuring the propagation mode
is nothing people should have to think about, in particular as the
matter is not precisely easy to grok. Moreover, MountFlags= allows configuration
of "private" and "shared" modes which don't really make much sense to use
in real life and are quite confusing. In particular MountFlags=private means
mounts made on the host stay pinned for good by the service which is
particularly nasty for removable media mounts. And MountFlags=shared
is in most ways a NOP when used alone...

The main technical difference between setting only MountFlags=slave or
only PrivateMounts=yes in a unit file is that the former remounts all
mounts to MS_SLAVE and leaves them there, while the latter remounts
them to MS_SHARED again right after. The latter is generally a nicer
approach, since it disables propagation, while MS_SHARED is afterwards
in effect, which is really nice as that means further namespacing down
the tree will get MS_SHARED logic by default and we unify how
applications see our mounts as we always pass them as MS_SHARED
regardless whether any mount namespacing is used or not.

The effect of PrivateMounts=yes was implied already by all the other
mount namespacing options. With this new option we add an explicit knob
for it, to request it without any other option used as well.

See: #4393

											
										
										
											2018-06-01 11:10:49 +02:00
+								                        .private_mounts = context->private_mounts,
-												core: disable namespace sandboxing for '+' prefixed lines

Fixes #8842.

											
										
										
											2018-05-01 06:33:34 +02:00
+								                };
-												core: add new PrivateMounts= unit setting

This new setting is supposed to be useful in most cases where
"MountFlags=slave" is currently used, i.e. as an explicit way to run a
service in its own mount namespace and decouple propagation from all
mounts of the new mount namespace towards the host.

The effect of MountFlags=slave and PrivateMounts=yes is mostly the same,
as both cause a CLONE_NEWNS namespace to be opened, and both will result
in all mounts within it to be mounted MS_SLAVE. The difference is mostly
on the conceptual/philosophical level: configuring the propagation mode
is nothing people should have to think about, in particular as the
matter is not precisely easy to grok. Moreover, MountFlags= allows configuration
of "private" and "shared" modes which don't really make much sense to use
in real life and are quite confusing. In particular MountFlags=private means
mounts made on the host stay pinned for good by the service which is
particularly nasty for removable media mounts. And MountFlags=shared
is in most ways a NOP when used alone...

The main technical difference between setting only MountFlags=slave or
only PrivateMounts=yes in a unit file is that the former remounts all
mounts to MS_SLAVE and leaves them there, while the latter remounts
them to MS_SHARED again right after. The latter is generally a nicer
approach, since it disables propagation, while MS_SHARED is afterwards
in effect, which is really nice as that means further namespacing down
the tree will get MS_SHARED logic by default and we unify how
applications see our mounts as we always pass them as MS_SHARED
regardless whether any mount namespacing is used or not.

The effect of PrivateMounts=yes was implied already by all the other
mount namespacing options. With this new option we add an explicit knob
for it, to request it without any other option used as well.

See: #4393

											
										
										
											2018-06-01 11:10:49 +02:00
+								        else if (!context->dynamic_user && root_dir)
 								                /*
 								                 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
 								                 * sandbox info, otherwise enforce it, don't ignore protected paths and
 								                 * fail if we are unable to apply the sandbox inside the mount namespace.
 								                 */
 								                ns_info = (NamespaceInfo) {
 								                        .ignore_protect_paths = true,
 								                };
 								        else
 								                ns_info = (NamespaceInfo) {};
-												core: disable namespace sandboxing for '+' prefixed lines

Fixes #8842.

											
										
										
											2018-05-01 06:33:34 +02:00
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        r = setup_namespace(root_dir, root_image,
-												core: remove compile_read_write_paths()

From 6c47cd7d3bf35c8158a0737f34fe2c5dc95e72d6, RuntimeDirectory= and
their friends also imply BindPaths=. Thus, implying ReadWritePaths=
is meaningless.

											
										
										
											2017-10-13 14:13:25 +02:00
+								                            &ns_info, context->read_write_paths,
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                            needs_sandboxing ? context->read_only_paths : NULL,
 								                            needs_sandboxing ? context->inaccessible_paths : NULL,
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in. This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                            empty_directories,
 								                            bind_mounts,
 								                            n_bind_mounts,
-												core: add new setting TemporaryFileSystem=

This introduces a new setting TemporaryFileSystem=. This is useful
to hide files not relevant to the processes invoked by unit, while
necessary files or directories can be still accessed by combining
with Bind{,ReadOnly}Paths=.

											
										
										
											2018-02-21 01:17:52 +01:00
+								                            context->temporary_filesystems,
 								                            context->n_temporary_filesystems,
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								                            tmp,
 								                            var,
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is the left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                            needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
 								                            needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								                            context->mount_flags,
 								                            DISSECT_IMAGE_DISCARD_ON_LOOP);
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in. This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								        bind_mount_free_many(bind_mounts, n_bind_mounts);
-												namespace: be more careful when handling namespacing failures gracefully

This makes two changes to the namespacing code:

1. We'll only gracefully skip service namespacing on access failure if
   exclusively sandboxing options were selected, and not mount-related
   options that result in a very different view of the world. For example,
   ignoring RootDirectory=, RootImage= or Bind= is really problematic,
   but ReadOnlyPaths= is just a weaker sandbox.

2. The namespacing code will now return a clearly recognizable error
   code when it cannot enforce its namespacing, so that we cannot
   confuse EPERM errors from mount() with those from unshare(). Only the
   errors from the first unshare() are now taken as hint to gracefully
   disable namespacing.

Fixes: #9844 #9835

											
										
										
											2018-08-10 15:07:14 +02:00
+								        /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
 								         * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
 								         * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
 								         * completely different execution environment. */
 								        if (r == -ENOANO &&
 								            n_bind_mounts == 0 && context->n_temporary_filesystems == 0 &&
 								            !root_dir && !root_image &&
 								            !context->dynamic_user) {
 								                log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
-												execute: drop explicit log_open()/log_close() now that it is unnecessary

											
										
										
											2017-09-26 17:41:53 +02:00
+								                return 0;
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								        }
 								        return r;
 								}
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								static int apply_working_directory(
 								                const ExecContext *context,
 								                const ExecParameters *params,
 								                const char *home,
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								                const bool needs_mount_ns,
 								                int *exit_status) {
 								        /* Picks the working directory (home, context->working_directory, or "/"), optionally chroot()s into
 								         * context->root_directory, and finally chdir()s to the chosen directory.
 								         *
 								         * Returns 0 on success. On failure returns a negative errno-style value and stores EXIT_CHDIR or
 								         * EXIT_CHROOT in *exit_status, depending on which step failed. */
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								        const char *d, *wd;
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and lets get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
 								        assert(context);
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								        assert(exit_status);
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and lets get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								        if (context->working_directory_home) {
 								                /* The home directory is requested as working directory: without a known home we cannot
 								                 * proceed, so report a chdir failure with -ENXIO. */
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								                if (!home) {
 								                        *exit_status = EXIT_CHDIR;
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								                        return -ENXIO;
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								                }
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and lets get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								                wd = home;
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
 								        } else if (context->working_directory)
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and lets get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								                wd = context->working_directory;
 								        else
 								                wd = "/";
-												core: move apply working directory code into its own apply_working_directory()

											
										
										
											2016-10-27 09:21:44 +02:00
 								        /* With EXEC_APPLY_CHROOT set: chroot() into the root directory unless needs_mount_ns is set, then
 								         * chdir() to wd as-is. NOTE(review): presumably the root was already switched while setting up the
 								         * mount namespace when needs_mount_ns is true -- confirm against the caller. Without the flag no
 								         * chroot() happens and the chdir() target is built by prefix_roota() from the root directory and wd. */
 								        if (params->flags & EXEC_APPLY_CHROOT) {
 								                if (!needs_mount_ns && context->root_directory)
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								                        if (chroot(context->root_directory) < 0) {
 								                                *exit_status = EXIT_CHROOT;
-												core: move apply working directory code into its own apply_working_directory()

											
										
										
											2016-10-27 09:21:44 +02:00
+								                                return -errno;
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								                        }
-												core: move apply working directory code into its own apply_working_directory()

											
										
										
											2016-10-27 09:21:44 +02:00
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and lets get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								                d = wd;
 								        } else
-												execute: use prefix_roota() where appropriate

											
										
										
											2017-02-09 13:16:51 +01:00
+								                d = prefix_roota(context->root_directory, wd);
-												core: move apply working directory code into its own apply_working_directory()

											
										
										
											2016-10-27 09:21:44 +02:00
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
 								        /* A chdir() failure is only fatal when working_directory_missing_ok is unset; otherwise it is
 								         * ignored and 0 is returned with the current directory left unchanged. */
+								        if (chdir(d) < 0 && !context->working_directory_missing_ok) {
 								                *exit_status = EXIT_CHDIR;
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and lets get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								                return -errno;
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								        }
-												core: move apply working directory code into its own apply_working_directory()

											
										
										
											2016-10-27 09:21:44 +02:00
 								        return 0;
 								}
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								static int setup_keyring(
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								                const Unit *u,
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								                const ExecContext *context,
 								                const ExecParameters *p,
 								                uid_t uid, gid_t gid) {
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        key_serial_t keyring;
-												core: use setreuid/setregid trick to create session keyring with right ownership (#8447)

Re-use the hacks used to link user keyring, when creating the session
keyring. This way changing ownership of the keyring is not required, and thus
invocation_id can be correctly created in restricted environments.

Creating invocation_id with root permissions works and linking it into session
keyring works, as at that point session keyring is possessed.

Simple way to validate this is with following commands:

$ journalctl -f &
$ sudo systemd-run --uid 1000 /bin/sh -c 'keyctl describe @s; keyctl list @s; keyctl read `keyctl search @s user invocation_id`'

which now works in LXD containers as well as on the host.

Fixes: https://github.com/systemd/systemd/issues/7655
											
										
										
											2018-03-27 12:58:10 +02:00
+								        int r = 0;
 								        uid_t saved_uid;
 								        gid_t saved_gid;
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
 								        assert(u);
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								        assert(context);
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        assert(p);
 								        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
 								         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
 								         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
 								         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
 								         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
 								         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
 								        if (!(p->flags & EXEC_NEW_KEYRING))
 								                return 0;
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
 								                return 0;
-												core: use setreuid/setregid trick to create session keyring with right ownership (#8447)

Re-use the hacks used to link user keyring, when creating the session
keyring. This way changing ownership of the keyring is not required, and thus
invocation_id can be correctly created in restricted environments.

Creating invocation_id with root permissions works and linking it into session
keyring works, as at that point session keyring is possessed.

Simple way to validate this is with following commands:

$ journalctl -f &
$ sudo systemd-run --uid 1000 /bin/sh -c 'keyctl describe @s; keyctl list @s; keyctl read `keyctl search @s user invocation_id`'

which now works in LXD containers as well as on the host.

Fixes: https://github.com/systemd/systemd/issues/7655
											
										
										
											2018-03-27 12:58:10 +02:00
+								        /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
 								         * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
 								         * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
 								         * & group is just as nasty as acquiring a reference to the user keyring. */
 								        saved_uid = getuid();
 								        saved_gid = getgid();
 								        if (gid_is_valid(gid) && gid != saved_gid) {
 								                if (setregid(gid, -1) < 0)
 								                        return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
 								        }
 								        if (uid_is_valid(uid) && uid != saved_uid) {
 								                if (setreuid(uid, -1) < 0) {
 								                        r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
 								                        goto out;
 								                }
 								        }
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
 								        if (keyring == -1) {
 								                if (errno == ENOSYS)
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								                else if (IN_SET(errno, EACCES, EPERM))
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								                else if (errno == EDQUOT)
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								                else
-												core: use setreuid/setregid trick to create session keyring with right ownership (#8447)

Re-use the hacks used to link user keyring, when creating the session
keyring. This way changing ownership of the keyring is not required, and thus
invocation_id can be correctly created in restricted environments.

Creating invocation_id with root permissions works and linking it into session
keyring works, as at that point session keyring is possessed.

Simple way to validate this is with following commands:

$ journalctl -f &
$ sudo systemd-run --uid 1000 /bin/sh -c 'keyctl describe @s; keyctl list @s; keyctl read `keyctl search @s user invocation_id`'

which now works in LXD containers as well as on the host.

Fixes: https://github.com/systemd/systemd/issues/7655
											
										
										
											2018-03-27 12:58:10 +02:00
+								                        r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
-												core: use setreuid/setregid trick to create session keyring with right ownership (#8447)

Re-use the hacks used to link user keyring, when creating the session
keyring. This way changing ownership of the keyring is not required, and thus
invocation_id can be correctly created in restricted environments.

Creating invocation_id with root permissions works and linking it into session
keyring works, as at that point session keyring is possessed.

Simple way to validate this is with following commands:

$ journalctl -f &
$ sudo systemd-run --uid 1000 /bin/sh -c 'keyctl describe @s; keyctl list @s; keyctl read `keyctl search @s user invocation_id`'

which now works in LXD containers as well as on the host.

Fixes: https://github.com/systemd/systemd/issues/7655
											
										
										
											2018-03-27 12:58:10 +02:00
+								                goto out;
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        }
-												core: use setreuid/setregid trick to create session keyring with right ownership (#8447)

Re-use the hacks used to link user keyring, when creating the session
keyring. This way changing ownership of the keyring is not required, and thus
invocation_id can be correctly created in restricted environments.

Creating invocation_id with root permissions works and linking it into session
keyring works, as at that point session keyring is possessed.

Simple way to validate this is with following commands:

$ journalctl -f &
$ sudo systemd-run --uid 1000 /bin/sh -c 'keyctl describe @s; keyctl list @s; keyctl read `keyctl search @s user invocation_id`'

which now works in LXD containers as well as on the host.

Fixes: https://github.com/systemd/systemd/issues/7655
											
										
										
											2018-03-27 12:58:10 +02:00
+								        /* When requested link the user keyring into the session keyring. */
 								        if (context->keyring_mode == EXEC_KEYRING_SHARED) {
 								                if (keyctl(KEYCTL_LINK,
 								                           KEY_SPEC_USER_KEYRING,
 								                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
 								                        r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
 								                        goto out;
 								                }
 								        }
 								        /* Restore uid/gid back */
 								        if (uid_is_valid(uid) && uid != saved_uid) {
 								                if (setreuid(saved_uid, -1) < 0) {
 								                        r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
 								                        goto out;
 								                }
 								        }
 								        if (gid_is_valid(gid) && gid != saved_gid) {
 								                if (setregid(saved_gid, -1) < 0)
 								                        return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
 								        }
 								        /* Populate they keyring with the invocation ID by default, as original saved_uid. */
-												core: store the invocation ID in the per-service keyring

Let's store the invocation ID in the per-service keyring as a root-owned key,
with strict access rights. This has the advantage over the environment-based ID
passing that it also works from SUID binaries (as the key cannot be overridden
by unprivileged code starting them), in contrast to the secure_getenv() based
mode.

The invocation ID is now passed in three different ways to a service:

- As environment variable $INVOCATION_ID. This is easy to use, but may be
  overridden by unprivileged code (which might be a bad or a good thing), which
  means it's incompatible with SUID code (see above).

- As extended attribute on the service cgroup. This cannot be overridden by
  unprivileged code, and may be queried safely from "outside" of a service.
  However, it is incompatible with containers right now, as unprivileged
  containers generally cannot set xattrs on cgroupfs.

- As "invocation_id" key in the kernel keyring. This has the benefit that the
  key cannot be changed by unprivileged service code, and thus is safe to
  access from SUID code (see above). But do note that service code can replace
  the session keyring with a fresh one that lacks the key. However in that case
  the key will not be owned by root, which is easily detectable. The keyring is
  also incompatible with containers right now, as it is not properly namespace
  aware (but this is being worked on), and thus most container managers mask
  the keyring-related system calls.

Ideally we'd only have one way to pass the invocation ID, but the different
ways all have limitations. The invocation ID hookup in journald is currently
only available on the host but not in containers, due to the mentioned
limitations.

How to verify the new invocation ID in the keyring:

 # systemd-run -t /bin/sh
 Running as unit: run-rd917366c04f847b480d486017f7239d6.service
 Press ^] three times within 1s to disconnect TTY.
 # keyctl show
 Session Keyring
  680208392 --alswrv      0     0  keyring: _ses
  250926536 ----s-rv      0     0   \_ user: invocation_id
 # keyctl request user invocation_id
 250926536
 # keyctl read 250926536
 16 bytes of data in key:
 9c96317c ac64495a a42b9cd7 4f3ff96b
 # echo $INVOCATION_ID
 9c96317cac64495aa42b9cd74f3ff96b
 # ^D

This creates a new transient service running a shell. Then verifies the
contents of the keyring, requests the invocation ID key, and reads its payload.
For comparison the invocation ID as passed via the environment variable is also
displayed.

											
										
										
											2016-12-02 15:05:55 +01:00
+								        if (!sd_id128_is_null(u->invocation_id)) {
 								                key_serial_t key;
 								                key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
 								                if (key == -1)
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
-												core: store the invocation ID in the per-service keyring

Let's store the invocation ID in the per-service keyring as a root-owned key,
with strict access rights. This has the advantage over the environment-based ID
passing that it also works from SUID binaries (as the key cannot be overridden
by unprivileged code starting them), in contrast to the secure_getenv() based
mode.

The invocation ID is now passed in three different ways to a service:

- As environment variable $INVOCATION_ID. This is easy to use, but may be
  overridden by unprivileged code (which might be a bad or a good thing), which
  means it's incompatible with SUID code (see above).

- As extended attribute on the service cgroup. This cannot be overridden by
  unprivileged code, and may be queried safely from "outside" of a service.
  However, it is incompatible with containers right now, as unprivileged
  containers generally cannot set xattrs on cgroupfs.

- As "invocation_id" key in the kernel keyring. This has the benefit that the
  key cannot be changed by unprivileged service code, and thus is safe to
  access from SUID code (see above). But do note that service code can replace
  the session keyring with a fresh one that lacks the key. However in that case
  the key will not be owned by root, which is easily detectable. The keyring is
  also incompatible with containers right now, as it is not properly namespace
  aware (but this is being worked on), and thus most container managers mask
  the keyring-related system calls.

Ideally we'd only have one way to pass the invocation ID, but the different
ways all have limitations. The invocation ID hookup in journald is currently
only available on the host but not in containers, due to the mentioned
limitations.

How to verify the new invocation ID in the keyring:

 # systemd-run -t /bin/sh
 Running as unit: run-rd917366c04f847b480d486017f7239d6.service
 Press ^] three times within 1s to disconnect TTY.
 # keyctl show
 Session Keyring
  680208392 --alswrv      0     0  keyring: _ses
  250926536 ----s-rv      0     0   \_ user: invocation_id
 # keyctl request user invocation_id
 250926536
 # keyctl read 250926536
 16 bytes of data in key:
 9c96317c ac64495a a42b9cd7 4f3ff96b
 # echo $INVOCATION_ID
 9c96317cac64495aa42b9cd74f3ff96b
 # ^D

This creates a new transient service running a shell. Then verifies the
contents of the keyring, requests the invocation ID key, and reads its payload.
For comparison the invocation ID as passed via the environment variable is also
displayed.

											
										
										
											2016-12-02 15:05:55 +01:00
+								                else {
 								                        if (keyctl(KEYCTL_SETPERM, key,
 								                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
 								                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
-												core: use setreuid/setregid trick to create session keyring with right ownership (#8447)

Re-use the hacks used to link user keyring, when creating the session
keyring. This way changing ownership of the keyring is not required, and thus
invocation_id can be correctly created in restricted environments.

Creating invocation_id with root permissions works and linking it into session
keyring works, as at that point session keyring is possessed.

Simple way to validate this is with following commands:

$ journalctl -f &
$ sudo systemd-run --uid 1000 /bin/sh -c 'keyctl describe @s; keyctl list @s; keyctl read `keyctl search @s user invocation_id`'

which now works in LXD containers as well as on the host.

Fixes: https://github.com/systemd/systemd/issues/7655
											
										
										
											2018-03-27 12:58:10 +02:00
+								                                r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
-												core: store the invocation ID in the per-service keyring

Let's store the invocation ID in the per-service keyring as a root-owned key,
with strict access rights. This has the advantage over the environment-based ID
passing that it also works from SUID binaries (as the key cannot be overridden
by unprivileged code starting them), in contrast to the secure_getenv() based
mode.

The invocation ID is now passed in three different ways to a service:

- As environment variable $INVOCATION_ID. This is easy to use, but may be
  overridden by unprivileged code (which might be a bad or a good thing), which
  means it's incompatible with SUID code (see above).

- As extended attribute on the service cgroup. This cannot be overridden by
  unprivileged code, and may be queried safely from "outside" of a service.
  However, it is incompatible with containers right now, as unprivileged
  containers generally cannot set xattrs on cgroupfs.

- As "invocation_id" key in the kernel keyring. This has the benefit that the
  key cannot be changed by unprivileged service code, and thus is safe to
  access from SUID code (see above). But do note that service code can replace
  the session keyring with a fresh one that lacks the key. However in that case
  the key will not be owned by root, which is easily detectable. The keyring is
  also incompatible with containers right now, as it is not properly namespace
  aware (but this is being worked on), and thus most container managers mask
  the keyring-related system calls.

Ideally we'd only have one way to pass the invocation ID, but the different
ways all have limitations. The invocation ID hookup in journald is currently
only available on the host but not in containers, due to the mentioned
limitations.

How to verify the new invocation ID in the keyring:

 # systemd-run -t /bin/sh
 Running as unit: run-rd917366c04f847b480d486017f7239d6.service
 Press ^] three times within 1s to disconnect TTY.
 # keyctl show
 Session Keyring
  680208392 --alswrv      0     0  keyring: _ses
  250926536 ----s-rv      0     0   \_ user: invocation_id
 # keyctl request user invocation_id
 250926536
 # keyctl read 250926536
 16 bytes of data in key:
 9c96317c ac64495a a42b9cd7 4f3ff96b
 # echo $INVOCATION_ID
 9c96317cac64495aa42b9cd74f3ff96b
 # ^D

This creates a new transient service running a shell. Then verifies the
contents of the keyring, requests the invocation ID key, and reads its payload.
For comparison the invocation ID as passed via the environment variable is also
displayed.

											
										
										
											2016-12-02 15:05:55 +01:00
+								                }
 								        }
-												core: use setreuid/setregid trick to create session keyring with right ownership (#8447)

Re-use the hacks used to link user keyring, when creating the session
keyring. This way changing ownership of the keyring is not required, and thus
invocation_id can be correctly created in restricted environments.

Creating invocation_id with root permissions works and linking it into session
keyring works, as at that point session keyring is possessed.

Simple way to validate this is with following commands:

$ journalctl -f &
$ sudo systemd-run --uid 1000 /bin/sh -c 'keyctl describe @s; keyctl list @s; keyctl read `keyctl search @s user invocation_id`'

which now works in LXD containers as well as on the host.

Fixes: https://github.com/systemd/systemd/issues/7655
											
										
										
											2018-03-27 12:58:10 +02:00
+								out:
 								        /* Revert back uid & gid for the the last time, and exit */
 								        /* no extra logging, as only the first already reported error matters */
 								        if (getuid() != saved_uid)
 								                (void) setreuid(saved_uid, -1);
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
-												core: use setreuid/setregid trick to create session keyring with right ownership (#8447)

Re-use the hacks used to link user keyring, when creating the session
keyring. This way changing ownership of the keyring is not required, and thus
invocation_id can be correctly created in restricted environments.

Creating invocation_id with root permissions works and linking it into session
keyring works, as at that point session keyring is possessed.

Simple way to validate this is with following commands:

$ journalctl -f &
$ sudo systemd-run --uid 1000 /bin/sh -c 'keyctl describe @s; keyctl list @s; keyctl read `keyctl search @s user invocation_id`'

which now works in LXD containers as well as on the host.

Fixes: https://github.com/systemd/systemd/issues/7655
											
										
										
											2018-03-27 12:58:10 +02:00
+								        if (getgid() != saved_gid)
 								                (void) setregid(saved_gid, -1);
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
-												core: use setreuid/setregid trick to create session keyring with right ownership (#8447)

Re-use the hacks used to link user keyring, when creating the session
keyring. This way changing ownership of the keyring is not required, and thus
invocation_id can be correctly created in restricted environments.

Creating invocation_id with root permissions works and linking it into session
keyring works, as at that point session keyring is possessed.

Simple way to validate this is with following commands:

$ journalctl -f &
$ sudo systemd-run --uid 1000 /bin/sh -c 'keyctl describe @s; keyctl list @s; keyctl read `keyctl search @s user invocation_id`'

which now works in LXD containers as well as on the host.

Fixes: https://github.com/systemd/systemd/issues/7655
											
										
										
											2018-03-27 12:58:10 +02:00
+								        return r;
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								}
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								        assert(array);
 								        assert(n);
 								        if (!pair)
 								                return;
 								        if (pair[0] >= 0)
 								                array[(*n)++] = pair[0];
 								        if (pair[1] >= 0)
 								                array[(*n)++] = pair[1];
 								}
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								static int close_remaining_fds(
 								                const ExecParameters *params,
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								                const ExecRuntime *runtime,
 								                const DynamicCreds *dcreds,
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								                int user_lookup_fd,
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								                int socket_fd,
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								                int exec_fd,
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								                int *fds, size_t n_fds) {
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								        size_t n_dont_close = 0;
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								        int dont_close[n_fds + 12];
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
 								        assert(params);
 								        if (params->stdin_fd >= 0)
 								                dont_close[n_dont_close++] = params->stdin_fd;
 								        if (params->stdout_fd >= 0)
 								                dont_close[n_dont_close++] = params->stdout_fd;
 								        if (params->stderr_fd >= 0)
 								                dont_close[n_dont_close++] = params->stderr_fd;
 								        if (socket_fd >= 0)
 								                dont_close[n_dont_close++] = socket_fd;
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								        if (exec_fd >= 0)
 								                dont_close[n_dont_close++] = exec_fd;
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        if (n_fds > 0) {
 								                memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
 								                n_dont_close += n_fds;
 								        }
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								        if (runtime)
 								                append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
 								        if (dcreds) {
 								                if (dcreds->user)
 								                        append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
 								                if (dcreds->group)
 								                        append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        }
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								        if (user_lookup_fd >= 0)
 								                dont_close[n_dont_close++] = user_lookup_fd;
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        return close_all_fds(dont_close, n_dont_close);
 								}
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								static int send_user_lookup(
 								                Unit *unit,
 								                int user_lookup_fd,
 								                uid_t uid,
 								                gid_t gid) {
 								        assert(unit);
 								        /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
 								         * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
 								         * specified. */
 								        if (user_lookup_fd < 0)
 								                return 0;
 								        if (!uid_is_valid(uid) && !gid_is_valid(gid))
 								                return 0;
 								        if (writev(user_lookup_fd,
 								               (struct iovec[]) {
-												io-util: add new IOVEC_INIT/IOVEC_MAKE macros

This adds IOVEC_INIT() and IOVEC_MAKE() for initializing iovec structures
from a pointer and a size. On top of these IOVEC_INIT_STRING() and
IOVEC_MAKE_STRING() are added which take a string and automatically
determine the size of the string using strlen().

This patch removes the old IOVEC_SET_STRING() macro, given that
IOVEC_MAKE_STRING() is now useful for similar purposes. Note that the
old IOVEC_SET_STRING() invocations were two characters shorter than the
new ones using IOVEC_MAKE_STRING(), but I think the new syntax is more
readable and more generic as it simply resolves to a C99 literal
structure initialization. Moreover, we can use very similar syntax now
for initializing strings and pointer+size iovec entries. We can also use
the new macros to initialize function parameters on-the-fly or array
definitions. And given that we shouldn't have so many ways to do the
same stuff, let's just settle on the new macros.

(This also converts some code to use _cleanup_ where dynamically
allocated strings were using IOVEC_SET_STRING() before, to modernize
things a bit)

											
										
										
											2017-09-21 13:52:34 +02:00
+								                           IOVEC_INIT(&uid, sizeof(uid)),
 								                           IOVEC_INIT(&gid, sizeof(gid)),
 								                           IOVEC_INIT_STRING(unit->id) }, 3) < 0)
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								                return -errno;
 								        return 0;
 								}
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
 								        int r;
 								        assert(c);
 								        assert(home);
 								        assert(buf);
 								        /* If WorkingDirectory=~ is set, try to acquire a usable home directory.
 								         *
 								         * Returns 0 and leaves *home untouched when *home is already set or when
 								         * c->working_directory_home is false. Returns 1 after storing a home directory in
 								         * *home. Returns the negative value reported by get_home_dir() if that call fails. */
 								        if (*home)
 								                return 0;
 								        if (!c->working_directory_home)
 								                return 0;
 								        if (uid == 0) {
 								                /* Hardcode /root as home directory for UID 0 */
 								                *home = "/root";
 								                return 1;
 								        }
 								        /* For any other UID, ask get_home_dir() and make *home point at the string it stored in
 								         * *buf. NOTE(review): presumably the caller owns and frees *buf — confirm against
 								         * get_home_dir(). */
 								        r = get_home_dir(buf);
 								        if (r < 0)
 								                return r;
 								        *home = *buf;
 								        return 1;
 								}
-												core: when looking for a UID to use for a dynamic UID start with the current owner of the StateDirectory= and friends

Let's optimize dynamic UID allocation a bit: if a StateDirectory= (or
suchlike) is configured, we start our allocation loop from that UID and
use it if it currently isn't used otherwise. This is beneficial as it
saves us from having to expensively recursively chown() these
directories in the typical case (which StateDirectory= does when it
notices that the owner of the directory doesn't match the UID picked).

With this in place we now have a three-phase logic for allocating a
dynamic UID:

a) first, we try to use the owning UID of StateDirectory=,
   CacheDirectory=, LogDirectory= if that exists and is currently
   otherwise unused.

b) if that didn't work out, we hash the UID from the service name

c) if that didn't yield an unused UID either, randomly pick new ones
   until we find a free one.

											
										
										
											2017-09-28 20:28:09 +02:00
+								static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
 								        _cleanup_strv_free_ char ** list = NULL;
 								        ExecDirectoryType t;
 								        int r;
 								        assert(c);
 								        assert(p);
 								        assert(ret);
 								        assert(c->dynamic_user);
 								        /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
 								         * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
 								         * directories. */
 								        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
 								                char **i;
 								                if (t == EXEC_DIRECTORY_CONFIGURATION)
 								                        continue;
 								                if (!p->prefix[t])
 								                        continue;
 								                STRV_FOREACH(i, c->directories[t].paths) {
 								                        char *e;
-												core/execute: do not create RuntimeDirectory= under private/ sub-directory

RuntimeDirectory= often used for sharing files or sockets with other
services. So, if creating them under private/ sub-directory, we cannot
set DynamicUser= to service units which want to share something through
RuntimeDirectory=.
This ensures the directories given by RuntimeDirectory= are created under
/run/ even if DynamicUser= is set.

Fixes #7260.

											
										
										
											2017-11-08 07:50:58 +01:00
+								                        if (t == EXEC_DIRECTORY_RUNTIME)
 								                                e = strjoin(p->prefix[t], "/", *i);
 								                        else
 								                                e = strjoin(p->prefix[t], "/private/", *i);
-												core: when looking for a UID to use for a dynamic UID start with the current owner of the StateDirectory= and friends

Let's optimize dynamic UID allocation a bit: if a StateDirectory= (or
suchlike) is configured, we start our allocation loop from that UID and
use it if it currently isn't used otherwise. This is beneficial as it
saves us from having to expensively recursively chown() these
directories in the typical case (which StateDirectory= does when it
notices that the owner of the directory doesn't match the UID picked).

With this in place we now have a three-phase logic for allocating a
dynamic UID:

a) first, we try to use the owning UID of StateDirectory=,
   CacheDirectory=, LogDirectory= if that exists and is currently
   otherwise unused.

b) if that didn't work out, we hash the UID from the service name

c) if that didn't yield an unused UID either, randomly pick new ones
   until we find a free one.

											
										
										
											2017-09-28 20:28:09 +02:00
+								                        if (!e)
 								                                return -ENOMEM;
 								                        r = strv_consume(&list, e);
 								                        if (r < 0)
 								                                return r;
 								                }
 								        }
-												macro: introduce TAKE_PTR() macro

This macro will read a pointer of any type, return it, and set the
pointer to NULL. This is useful as an explicit concept of passing
ownership of a memory area between pointers.

This takes inspiration from Rust:

https://doc.rust-lang.org/std/option/enum.Option.html#method.take

and was suggested by Alan Jenkins (@sourcejedi).

It drops ~160 lines of code from our codebase, which makes me like it.
Also, I think it clarifies passing of ownership, and thus helps
readability a bit (at least for the initiated who know the new macro)

											
										
										
											2018-03-22 16:53:26 +01:00
+								        *ret = TAKE_PTR(list);
-												core: when looking for a UID to use for a dynamic UID start with the current owner of the StateDirectory= and friends

Let's optimize dynamic UID allocation a bit: if a StateDirectory= (or
suchlike) is configured, we start our allocation loop from that UID and
use it if it currently isn't used otherwise. This is beneficial as it
saves us from having to expensively recursively chown() these
directories in the typical case (which StateDirectory= does when it
notices that the owner of the directory doesn't match the UID picked).

With this in place we now have a three-phase logic for allocating a
dynamic UID:

a) first, we try to use the owning UID of StateDirectory=,
   CacheDirectory=, LogDirectory= if that exists and is currently
   otherwise unused.

b) if that didn't work out, we hash the UID from the service name

c) if that didn't yield an unused UID either, randomly pick new ones
   until we find a free one.

											
										
										
											2017-09-28 20:28:09 +02:00
 								        return 0;
 								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								static char *exec_command_line(char **argv);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								static int exec_child(
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                Unit *unit,
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								                const ExecCommand *command,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                const ExecContext *context,
 								                const ExecParameters *params,
 								                ExecRuntime *runtime,
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                DynamicCreds *dcreds,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                int socket_fd,
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                int named_iofds[3],
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								                int *fds,
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								                size_t n_socket_fds,
-												core: swap order of "n_storage_fds" and "n_socket_fds" parameters

When processing fd lists to pass to activated programs we always place the
socket activation fds first, and the storage fds last. Irritatingly in
almost all calls the "n_storage_fds" parameter (i.e. the number of
storage fds to pass) came first so far, and the "n_socket_fds" parameter
second. Let's clean this up, and specify the number of fds in the order
the fds themselves are passed.

(Also, let's fix one more case where "unsigned" was used to size an
array, while we should use "size_t" instead.)

											
										
										
											2018-07-05 09:56:54 +02:00
+								                size_t n_storage_fds,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                char **files_env,
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								                int user_lookup_fd,
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                int *exit_status) {
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								        int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        _cleanup_free_ gid_t *supplementary_gids = NULL;
 								        const char *username = NULL, *groupname = NULL;
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								        _cleanup_free_ char *home_buffer = NULL;
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and lets get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								        const char *home = NULL, *shell = NULL;
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								        dev_t journal_stream_dev = 0;
 								        ino_t journal_stream_ino = 0;
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
 								                needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
 								                needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
 								                needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SELINUX
-												execute: define the variable mac_selinux_contex_net only when build with SELinux

											
										
										
											2017-12-05 06:07:38 +01:00
+								        _cleanup_free_ char *mac_selinux_context_net = NULL;
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								        bool use_selinux = false;
-												core: define variables only when they are required

Follow-up for 7f18ef0a555a3c3cef08e0965dc453fe5954b5a7.

											
										
										
											2017-08-02 07:38:08 +02:00
+								#endif
-												build-sys: s/HAVE_SMACK/ENABLE_SMACK/

Same justification as for HAVE_UTMP.

											
										
										
											2017-10-03 12:22:40 +02:00
+								#if ENABLE_SMACK
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
ot be used, and chache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								        bool use_smack = false;
-												core: define variables only when they are required

Follow-up for 7f18ef0a555a3c3cef08e0965dc453fe5954b5a7.

											
										
										
											2017-08-02 07:38:08 +02:00
+								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_APPARMOR
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								        bool use_apparmor = false;
-												core: define variables only when they are required

Follow-up for 7f18ef0a555a3c3cef08e0965dc453fe5954b5a7.

											
										
										
											2017-08-02 07:38:08 +02:00
+								#endif
-												treewide: introduce UID_INVALID (and friends) as macro for (uid_t) -1

											
										
										
											2014-11-28 20:51:01 +01:00
+								        uid_t uid = UID_INVALID;
 								        gid_t gid = GID_INVALID;
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								        size_t n_fds;
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        ExecDirectoryType dt;
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        int secure_bits;
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        assert(unit);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								        assert(command);
 								        assert(context);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        assert(params);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        assert(exit_status);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
 								        rename_process_from_path(command->path);
 								        /* We reset exactly these signals, since they are the
 								         * only ones we set to SIG_IGN in the main daemon. All
 								         * others we leave untouched because we set them to
 								         * SIG_DFL or a valid handler initially, both of which
 								         * will be demoted to SIG_DFL. */
-												tree-wide: whenever we fork off a foreign child process reset signal mask/handlers

Also, when the child is potentially long-running make sure to set a
death signal.

Also, ignore the result of the reset operations explicitly by casting
them to (void).

											
										
										
											2015-05-31 23:55:55 +02:00
+								        (void) default_signals(SIGNALS_CRASH_HANDLER,
 								                               SIGNALS_IGNORE, -1);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
 								        if (context->ignore_sigpipe)
-												tree-wide: whenever we fork off a foreign child process reset signal mask/handlers

Also, when the child is potentially long-running make sure to set a
death signal.

Also, ignore the result of the reset operations explicitly by casting
them to (void).

											
										
										
											2015-05-31 23:55:55 +02:00
+								                (void) ignore_signals(SIGPIPE, -1);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        r = reset_signal_mask();
 								        if (r < 0) {
 								                *exit_status = EXIT_SIGNAL_MASK;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (params->idle_pipe)
 								                do_idle_pipe_dance(params->idle_pipe);
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												execute: make use of the new logging mode in execute.c

											
										
										
											2017-09-26 17:45:32 +02:00
+								        /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
 								         * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
 								         * any fds open we don't really want open during the transition. In order to make logging work, we switch the
 								         * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        log_forget_fds();
-												execute: make use of the new logging mode in execute.c

											
										
										
											2017-09-26 17:45:32 +02:00
+								        log_set_open_when_needed(true);
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												execute: let's close glibc syslog channels too

Just in case something opened them, let's make sure glibc invalidates
them too.

Thankfully so far no library opened log channels behind our back, at
least as far as I know, hence this is actually a NOP, but let's better
be safe than sorry.

											
										
										
											2017-09-26 17:52:25 +02:00
+								        /* In case anything used libc syslog(), close this here, too */
 								        closelog();
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								        n_fds = n_socket_fds + n_storage_fds;
 								        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_FDS;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								        }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (!context->same_pgrp)
 								                if (setsid() < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_SETSID;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                }
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        exec_context_tty_reset(context, params);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core: confirm_spawn: always accept units with same_pgrp set for now

For some reason, units remaining in the same process group as PID 1
(same_pgrp=true) fail to acquire the console even if it's not taken by anyone.

So always accept for units with same_pgrp set for now.

											
										
										
											2016-11-14 17:37:40 +01:00
+								        if (unit_shall_confirm_spawn(unit)) {
-												core: allow to redirect confirmation messages to a different console

It's rather hard to parse the confirmation messages (enabled with
systemd.confirm_spawn=true) amongst the status messages and the kernel
ones (if enabled).

This patch gives the possibility to the user to redirect the confirmation
message to a different virtual console, either by giving its name or its path,
so those messages are separated from the other ones and easier to read.

											
										
										
											2016-11-02 10:38:22 +01:00
+								                const char *vc = params->confirm_spawn;
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                _cleanup_free_ char *cmdline = NULL;
-												core: drop "argv" field from ExecParameter structure

We always initialize it from the same field in ExecCommand anyway, hence
there's no point in passing it separately to exec_spawn(), after all we
already pass the ExecCommand structure itself anyway.

No change in behaviour.

											
										
										
											2018-07-17 18:47:32 +02:00
+								                cmdline = exec_command_line(command->argv);
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                if (!cmdline) {
-												execute: improve and augment execution log messages

Let's generate friendly messages for more cases, and make slight
adjustments to the existing messages.

											
										
										
											2017-09-15 16:42:09 +02:00
+								                        *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_oom();
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core: add 'i' in confirm spawn to give a short summary of the unit to spawn

											
										
										
											2016-11-12 14:55:12 +01:00
+								                r = ask_for_confirmation(vc, unit, cmdline);
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                if (r != CONFIRM_EXECUTE) {
 								                        if (r == CONFIRM_PRETEND_SUCCESS) {
 								                                *exit_status = EXIT_SUCCESS;
 								                                return 0;
 								                        }
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_CONFIRM;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        log_unit_error(unit, "Execution cancelled by the user");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                        return -ECANCELED;
 								                }
 								        }
-												execute: improve exec_spawn() logging

											
										
										
											2010-04-10 17:46:01 +02:00
-												pid1: tell PAM/NSS modules why we are calling them

											
										
										
											2018-07-04 15:35:28 +02:00
+								        /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
 								         * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
 								         * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
 								         * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
 								         * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
 								        if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
 								            setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
 								                *exit_status = EXIT_MEMORY;
 								                return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
 								        }
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								        if (context->dynamic_user && dcreds) {
-												core: when looking for a UID to use for a dynamic UID start with the current owner of the StateDirectory= and friends

Let's optimize dynamic UID allocation a bit: if a StateDirectory= (or
suchlike) is configured, we start our allocation loop from that UID and
use it if it currently isn't used otherwise. This is beneficial as it
saves us from having to expensively recursively chown() these
directories in the typical case (which StateDirectory= does when it
notices that the owner of the directory doesn't match the UID picked).

With this in place we now have a three-phase logic for allocating a
dynamic UID:

a) first, we try to use the owning UID of StateDirectory=,
   CacheDirectory=, LogDirectory= if that exists and is currently
   otherwise unused.

b) if that didn't work out, we hash the UID from the service name

c) if that didn't yield an unused UID either, randomly pick new ones
   until we find a free one.

											
										
										
											2017-09-28 20:28:09 +02:00
+								                _cleanup_strv_free_ char **suggested_paths = NULL;
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
-												pid1: tell PAM/NSS modules why we are calling them

											
										
										
											2018-07-04 15:35:28 +02:00
+								                /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
 								                 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
-												nss: add new "nss-systemd" NSS module for mapping dynamic users

With this NSS module all dynamic service users will be resolvable via NSS like
any real user.

											
										
										
											2016-07-14 19:19:49 +02:00
+								                if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
 								                        *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
-												nss: add new "nss-systemd" NSS module for mapping dynamic users

With this NSS module all dynamic service users will be resolvable via NSS like
any real user.

											
										
										
											2016-07-14 19:19:49 +02:00
+								                }
-												core: when looking for a UID to use for a dynamic UID start with the current owner of the StateDirectory= and friends

Let's optimize dynamic UID allocation a bit: if a StateDirectory= (or
suchlike) is configured, we start our allocation loop from that UID and
use it if it currently isn't used otherwise. This is beneficial as it
saves us from having to expensively recursively chown() these
directories in the typical case (which StateDirectory= does when it
notices that the owner of the directory doesn't match the UID picked).

With this in place we now have a three-phase logic for allocating a
dynamic UID:

a) first, we try to use the owning UID of StateDirectory=,
   CacheDirectory=, LogDirectory= if that exists and is currently
   otherwise unused.

b) if that didn't work out, we hash the UID from the service name

c) if that didn't yield an unused UID either, randomly pick new ones
   until we find a free one.

											
										
										
											2017-09-28 20:28:09 +02:00
+								                r = compile_suggested_paths(context, params, &suggested_paths);
 								                if (r < 0) {
 								                        *exit_status = EXIT_MEMORY;
 								                        return log_oom();
 								                }
 								                r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                if (r < 0) {
 								                        *exit_status = EXIT_USER;
-												core: fix invalid error message

The error message corresponds to EILSEQ is "Invalid or incomplete
multibyte or wide character", and is not suitable in this case.
So, let's show a custom error message when the function
dynamic_creds_realize() returns -EILSEQ.

											
										
										
											2017-10-18 01:57:54 +02:00
+								                        if (r == -EILSEQ) {
 								                                log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
 								                                return -EOPNOTSUPP;
 								                        }
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                }
-												pid1: provide a more detailed error message when execution fails (#5074)

Fixes #5000.
											
										
										
											2017-01-18 04:38:55 +01:00
+								                if (!uid_is_valid(uid)) {
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                        *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
-												pid1: provide a more detailed error message when execution fails (#5074)

Fixes #5000.
											
										
										
											2017-01-18 04:38:55 +01:00
+								                        return -ESRCH;
 								                }
 								                if (!gid_is_valid(gid)) {
 								                        *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                        return -ESRCH;
 								                }
-												core: fix group ownership when Group is set

When Group is set in the unit, the runtime directories are owned by
this group and not the default group of the user (same for cgroup paths
and standard outputs)

Fix #1231

											
										
										
											2015-09-21 15:45:51 +02:00
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                if (dcreds->user)
 								                        username = dcreds->user->name;
 								        } else {
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
 								                if (r < 0) {
 								                        *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
-												core: fix group ownership when Group is set

When Group is set in the unit, the runtime directories are owned by
this group and not the default group of the user (same for cgroup paths
and standard outputs)

Fix #1231

											
										
										
											2015-09-21 15:45:51 +02:00
+								                }
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                r = get_fixed_group(context, &groupname, &gid);
 								                if (r < 0) {
 								                        *exit_status = EXIT_GROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                }
-												core: intialize user aux groups and SupplementaryGroups= when DynamicUser= is set

Make sure that when DynamicUser= is set we initialize the user
supplementary groups and that we also support SupplementaryGroups=

Fixes: https://github.com/systemd/systemd/issues/4539

Thanks Evgeny Vereshchagin (@evverx)

											
										
										
											2016-11-02 22:42:40 +01:00
+								        }
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
-												core: intialize user aux groups and SupplementaryGroups= when DynamicUser= is set

Make sure that when DynamicUser= is set we initialize the user
supplementary groups and that we also support SupplementaryGroups=

Fixes: https://github.com/systemd/systemd/issues/4539

Thanks Evgeny Vereshchagin (@evverx)

											
										
										
											2016-11-02 22:42:40 +01:00
+								        /* Initialize user supplementary groups and get SupplementaryGroups= ones */
 								        r = get_supplementary_groups(context, username, groupname, gid,
 								                                     &supplementary_gids, &ngids);
 								        if (r < 0) {
 								                *exit_status = EXIT_GROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								        }
-												core: fix group ownership when Group is set

When Group is set in the unit, the runtime directories are owned by
this group and not the default group of the user (same for cgroup paths
and standard outputs)

Fix #1231

											
										
										
											2015-09-21 15:45:51 +02:00
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								        r = send_user_lookup(unit, user_lookup_fd, uid, gid);
 								        if (r < 0) {
 								                *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								        }
 								        user_lookup_fd = safe_close(user_lookup_fd);
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								        r = acquire_home(context, uid, &home, &home_buffer);
 								        if (r < 0) {
 								                *exit_status = EXIT_CHDIR;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								        }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        /* If a socket is connected to STDIN/STDOUT/STDERR, we
 								         * must sure to drop O_NONBLOCK */
 								        if (socket_fd >= 0)
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								                (void) fd_nonblock(socket_fd, false);
-												yay, we can start socket units

											
										
										
											2010-01-27 04:31:52 +01:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        r = setup_input(context, params, socket_fd, named_iofds);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_STDIN;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_STDOUT;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_STDERR;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
 								        if (params->cgroup_path) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
 								                if (r < 0) {
 								                        *exit_status = EXIT_CGROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
-												reset signal mask when forking

											
										
										
											2010-01-27 06:17:51 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												reset signal mask when forking

											
										
										
											2010-01-27 06:17:51 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->oom_score_adjust_set) {
-												process-util: add new helper call for adjusting the OOM score

And let's make use of it in execute.c

											
										
										
											2018-05-07 20:44:41 +02:00
+								                /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
 								                 * prohibit write access to this file, and we shouldn't trip up over that. */
 								                r = set_oom_score_adjust(context->oom_score_adjust);
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                if (IN_SET(r, -EPERM, -EACCES))
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similarly, there are now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                else if (r < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_OOM_ADJUST;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
 								        if (context->nice_set)
 								                if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_NICE;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->cpu_sched_set) {
 								                struct sched_param param = {
 								                        .sched_priority = context->cpu_sched_priority,
 								                };
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                r = sched_setscheduler(0,
 								                                       context->cpu_sched_policy |
 								                                       (context->cpu_sched_reset_on_fork ?
 								                                        SCHED_RESET_ON_FORK : 0),
 								                                       &param);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SETSCHEDULER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
-												execute: close inherited fds earlier

											
										
										
											2010-07-12 20:34:53 +02:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												execute: close inherited fds earlier

											
										
										
											2010-07-12 20:34:53 +02:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->cpuset)
 								                if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_CPUAFFINITY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->ioprio_set)
 								                if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_IOPRIO;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                }
-												exec: make sure O_NONBLOCK is off for all sockets passed as STDIN/STDOUT/STDERR

											
										
										
											2010-08-30 23:31:27 +02:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->timer_slack_nsec != NSEC_INFINITY)
 								                if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_TIMERSLACK;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
-												execute: log errors from "sd(EXEC)"

To give the administrator more hints about failures occurring in spawning
of commands than just the exit code, log the strerror.
All fds are closed, so reopen the log.

Related-to: https://bugzilla.redhat.com/show_bug.cgi?id=752901

											
										
										
											2011-11-17 00:21:16 +01:00
+								                }
-												support chrooting/setting of ioprio when spawning

											
										
										
											2010-01-29 20:46:22 +01:00
-												util-lib: wrap personality() to fix up broken glibc error handling (#6766)

glibc appears to propagate different errors in different ways, let's fix
this up, so that our own code doesn't get confused by this.

See #6752 + #6737 for details.

Fixes: #6755
											
										
										
											2017-09-08 16:16:29 +02:00
+								        if (context->personality != PERSONALITY_INVALID) {
 								                r = safe_personality(context->personality);
 								                if (r < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_PERSONALITY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
-												execute: log errors from "sd(EXEC)"

To give the administrator more hints about failures occurring in spawning
of commands than just the exit code, log the strerror.
All fds are closed, so reopen the log.

Related-to: https://bugzilla.redhat.com/show_bug.cgi?id=752901

											
										
										
											2011-11-17 00:21:16 +01:00
+								                }
-												util-lib: wrap personality() to fix up broken glibc error handling (#6766)

glibc appears to propagate different errors in different ways, let's fix
this up, so that our own code doesn't get confused by this.

See #6752 + #6737 for details.

Fixes: #6755
											
										
										
											2017-09-08 16:16:29 +02:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->utmp_id)
-												tree-wide: make use of getpid_cached() wherever we can

This moves pretty much all uses of getpid() over to getpid_cached(). I
didn't specifically check whether the optimization is worth it for each
replacement, but in order to keep things simple and systematic I
switched over everything at once.

											
										
										
											2017-07-20 16:19:18 +02:00
+								                utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
-												core/execute: pass the username to utmp/wtmp database

Before previous commit, username would be NULL for root, and set only
for other users. So the argument passed to utmp_put_init_process()
would be "root" for other users and NULL for root. Seems strange.
Instead, always pass the username if available.

											
										
										
											2017-02-03 17:32:42 +01:00
+								                                      context->tty_path,
-												core: optionally create LOGIN_PROCESS or USER_PROCESS utmp entries

When generating utmp/wtmp entries, optionally add both LOGIN_PROCESS and
INIT_PROCESS entries or even all three of LOGIN_PROCESS, INIT_PROCESS
and USER_PROCESS entries, instead of just a single INIT_PROCESS entry.

With this change systemd may be used to not only invoke a getty directly
in a SysV-compliant way but alternatively also a login(1) implementation
or even forego getty and login entirely, and invoke arbitrary shells in
a way that they appear in who(1) or w(1).

This is preparation for a later commit that adds a "machinectl shell"
operation to invoke a shell in a container, in a way that is compatible
with who(1) and w(1).

											
										
										
											2015-08-23 13:14:04 +02:00
+								                                      context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
 								                                      context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
 								                                      USER_PROCESS,
-												core/execute: pass the username to utmp/wtmp database

Before previous commit, username would be NULL for root, and set only
for other users. So the argument passed to utmp_put_init_process()
would be "root" for other users and NULL for root. Seems strange.
Instead, always pass the username if available.

											
										
										
											2017-02-03 17:32:42 +01:00
+								                                      username);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core: chown() any TTY used for stdin, not just when StandardInput=tty is used (#4347)

If stdin is supplied as an fd for transient units (using the
StandardInputFileDescriptor pseudo-property for transient units), then we
should also fix up the TTY ownership, not just when we opened the TTY
ourselves.

This simply drops the explicit is_terminal_input()-based check. Note that
chown_terminal() internally does a much more appropriate isatty()-based check
anyway, hence we can drop this without replacement.

Fixes: #4260
											
										
										
											2016-10-11 20:07:22 +02:00
+								        if (context->user) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                r = chown_terminal(STDIN_FILENO, uid);
 								                if (r < 0) {
 								                        *exit_status = EXIT_STDIN;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												cgroup: add cgroupsification

											
										
										
											2010-03-31 16:29:55 +02:00
-												cgroup-util: merge cg_set_tasks_access() and cg-set_group_access() into one

We never use these functions separately, hence don't bother splitting
them into two.

Also, simplify things a bit, and maintain tables for the attribute files
to chown. Let's also update those tables a bit, and include the new
"cgroup.threads" file in it, that needs to be delegated too, according
to the documentation.

											
										
										
											2017-11-24 18:30:23 +01:00
+								        /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
 								         * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
 								         * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
 								         * touch a single hierarchy too. */
-												execute: also fold the cgroup delegate bit into ExecFlags

											
										
										
											2017-08-01 10:51:18 +02:00
+								        if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
-												cgroup-util: merge cg_set_tasks_access() and cg-set_group_access() into one

We never use these functions separately, hence don't bother splitting
them into two.

Also, simplify things a bit, and maintain tables for the attribute files
to chown. Let's also update those tables a bit, and include the new
"cgroup.threads" file in it, that needs to be delegated too, according
to the documentation.

											
										
										
											2017-11-24 18:30:23 +01:00
+								                r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                if (r < 0) {
 								                        *exit_status = EXIT_CGROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
-												execute: add one more ExecFlags flag, for controlling unconditional directory chowning

Let's decouple the Manager object from the execution logic a bit more
here too, and simply pass along the fact whether we should
unconditionally chown the runtime/... directories via the ExecFlags
field too.

											
										
										
											2017-08-01 10:35:10 +02:00
+								                r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                if (r < 0)
 								                        return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								        r = build_environment(
-												core: bypass dynamic user lookups from dbus-daemon

dbus-daemon does NSS name look-ups in order to enforce its bus policy. This
might dead-lock if an NSS module use wants to use D-Bus for the look-up itself,
like our nss-systemd does. Let's work around this by bypassing bus
communication in the NSS module if we run inside of dbus-daemon. To make this
work we keep a bit of extra state in /run/systemd/dynamic-uid/ so that we don't
have to consult the bus, but can still resolve the names.

Note that the normal codepath continues to be via the bus, so that resolving
works from all mount namespaces and is subject to authentication, as before.

This is a bit dirty, but not too dirty, as dbus daemon is kinda special anyway
for PID 1.

											
										
										
											2016-08-02 12:28:51 +02:00
+								                        unit,
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								                        context,
 								                        params,
 								                        n_fds,
 								                        home,
 								                        username,
 								                        shell,
 								                        journal_stream_dev,
 								                        journal_stream_ino,
 								                        &our_env);
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        if (r < 0) {
 								                *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_oom();
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        }
 								        r = build_pass_environment(context, &pass_env);
 								        if (r < 0) {
 								                *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_oom();
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        }
 								        accum_env = strv_env_merge(5,
 								                                   params->environment,
 								                                   our_env,
 								                                   pass_env,
 								                                   context->environment,
 								                                   files_env,
 								                                   NULL);
 								        if (!accum_env) {
 								                *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_oom();
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        }
-												execute: Cleanup the environment early

By cleaning up before setting up PAM we maintain control of overriding
behavior in setting variables. Otherwise, pam_putenv is in control.
This also makes sure we use a cleaned up environment in replacing
variables in argv.

											
										
										
											2016-07-07 12:36:33 +02:00
+								        accum_env = strv_env_clean(accum_env);
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
-												execute: drop group priviliges only after setting up namespace

If PrivateDevices=yes is set, the namespace code creates device nodes in /dev
that should be owned by the host's root, hence let's make sure we set up the
namespace before dropping group privileges.

											
										
										
											2016-08-25 17:29:12 +02:00
+								        (void) umask(context->umask);
-												exec: move mac_smack_apply_pid() and setup_pam() to same condition block

This cleans up exec_child() function by moving mac_smack_apply_pid()
and setup_pam() to the same condition block, since both of them have
the same condition (i.e params->apply_permissions). It improves
readability without changing its operation.

											
										
										
											2015-09-23 13:53:09 +02:00
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								        r = setup_keyring(unit, context, params, uid, gid);
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_KEYRING;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
-												core: rename EXEC_APPLY_PERMISSIONS → EXEC_APPLY_SANDBOXING

"Permissions" was a bit of a misnomer, as it suggests that UNIX file
permission bits are adjusted, which aren't really changed here. Instead,
this is about UNIX credentials such as users or groups, as well as
namespacing, hence let's use a more generic term here, without any
misleading reference to UNIX file permissions: "sandboxing", which shall
refer to all kinds of sandboxing technologies, including UID/GID
dropping, selinux relabelling, namespacing, seccomp, and so on.

											
										
										
											2017-08-01 11:30:44 +02:00
+								        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
 								        needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
 								        if (needs_ambient_hack)
 								                needs_setuid = false;
 								        else
 								                needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
 								        if (needs_sandboxing) {
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								                /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
 								                 * present. The actual MAC context application will happen later, as late as possible, to avoid
 								                 * impacting our own code paths. */
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SELINUX
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                use_selinux = mac_selinux_use();
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								#endif
-												build-sys: s/HAVE_SMACK/ENABLE_SMACK/

Same justification as for HAVE_UTMP.

											
										
										
											2017-10-03 12:22:40 +02:00
+								#if ENABLE_SMACK
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                use_smack = mac_smack_use();
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_APPARMOR
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                use_apparmor = mac_apparmor_use();
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								#endif
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        }
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_setuid) {
 								                if (context->pam_name && username) {
 								                        r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_PAM;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                        }
 								                }
-												exec: move mac_smack_apply_pid() and setup_pam() to same condition block

This cleans up exec_child() function by moving mac_smack_apply_pid()
and setup_pam() to the same condition block, since both of them have
the same condition (i.e. params->apply_permissions). It improves
readability without changing its operation.

											
										
										
											2015-09-23 13:53:09 +02:00
+								        }
-												core: add Personality= option for units to set the personality for spawned processes

											
										
										
											2014-02-19 02:15:24 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
-												namespace: fall back gracefully when kernel doesn't support network namespaces (#7024)


											
										
										
											2017-10-10 09:46:13 +02:00
+								                if (ns_type_supported(NAMESPACE_NET)) {
 								                        r = setup_netns(runtime->netns_storage_socket);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_NETWORK;
 								                                return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
 								                        }
 								                } else
 								                        log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												service: optionally, create INIT_PROCESS/DEAD_PROCESS entries for a service

This should fix accounting for pam_limits and suchlike.

https://bugzilla.redhat.com/show_bug.cgi?id=636036

											
										
										
											2010-10-08 16:06:23 +02:00
-												core: Private*/Protect* options with RootDirectory

When a service is chrooted with the option RootDirectory=/opt/..., then
the options PrivateDevices, PrivateTmp, ProtectHome, ProtectSystem must
mount the directories under $RootDirectory/{dev,tmp,home,usr,boot}.

The test-ns tool can test setup_namespace() with and without chroot:
 $ sudo TEST_NS_PROJECTS=/home/lennart/projects ./test-ns
 $ sudo TEST_NS_CHROOT=/home/alban/debian-tree TEST_NS_PROJECTS=/home/alban/debian-tree/home/alban/Documents ./test-ns

											
										
										
											2015-05-18 12:20:28 +02:00
+								        needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
 								        if (needs_mount_namespace) {
-												core: skip ReadOnlyPaths= and other permission-related mounts on PermissionsStartOnly= (#5309)

ReadOnlyPaths=, ProtectHome=, InaccessiblePaths= and ProtectSystem= are
about restricting access and little more, hence they should be disabled
if PermissionsStartOnly= is used or ExecStart= lines are prefixed with a
"+". Do that.

(Note that we will still create namespaces and stuff, since that's about
a lot more than just permissions. We'll simply disable the effect of
the four options mentioned above, but nothing else mount related.)

This also adds a test for this, to ensure this works as intended.

No documentation updates, as the documentation is already vague enough
to support the new behaviour ("If true, the permission-related execution
options…"). We could clarify this further, but I think we might want to
extend the switches' behaviour a bit more in future, hence leave it at
this for now.

Fixes: #5308
											
										
										
											2017-02-12 06:44:46 +01:00
+								                r = apply_mount_namespace(unit, command, context, params, runtime);
-												execute: if RuntimeDirectory= is set, it should be writable

Implicitly make all dirs set with RuntimeDirectory= writable, as the concept
otherwise makes no sense.

											
										
										
											2016-08-25 10:42:38 +02:00
+								                if (r < 0) {
 								                        *exit_status = EXIT_NAMESPACE;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
-												execute: if RuntimeDirectory= is set, it should be writable

Implicitly make all dirs set with RuntimeDirectory= writable, as the concept
otherwise makes no sense.

											
										
										
											2016-08-25 10:42:38 +02:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												execute: implement privilege dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: lets apply working directory just after mount namespaces

This applies the groups after applying the working directory. This
may allow some flexibility, but at the same time it is not a big deal since
we don't execute or do anything between applying the working directory and
dropping groups.

											
										
										
											2016-10-25 16:24:35 +02:00
+								        /* Apply just after mount namespace setup */
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								        r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								        if (r < 0)
 								                return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
-												core: lets apply working directory just after mount namespaces

This applies the groups after applying the working directory. This
may allow some flexibility, but at the same time it is not a big deal since
we don't execute or do anything between applying the working directory and
dropping groups.

											
										
										
											2016-10-25 16:24:35 +02:00
-												core: initialize groups list before checking SupplementaryGroups= of a unit (#4533)

Always initialize the supplementary groups of caller before checking the
unit SupplementaryGroups= option.

Fixes https://github.com/systemd/systemd/issues/4531
											
										
										
											2016-11-02 17:51:35 +01:00
+								        /* Drop groups as early as possible */
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_setuid) {
-												core: cleanup for enforce_groups() (#7064)

SupplementaryGroups= is preprocessed in get_supplementary_groups().
So, it is not necessary to input ExecContext to enforce_groups().
											
										
										
											2017-10-12 08:10:25 +02:00
+								                r = enforce_groups(gid, supplementary_gids, ngids);
-												execute: drop group privileges only after setting up namespace

If PrivateDevices=yes is set, the namespace code creates device nodes in /dev
that should be owned by the host's root, hence let's make sure we set up the
namespace before dropping group privileges.

											
										
										
											2016-08-25 17:29:12 +02:00
+								                if (r < 0) {
 								                        *exit_status = EXIT_GROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
-												execute: drop group privileges only after setting up namespace

If PrivateDevices=yes is set, the namespace code creates device nodes in /dev
that should be owned by the host's root, hence let's make sure we set up the
namespace before dropping group privileges.

											
										
										
											2016-08-25 17:29:12 +02:00
+								                }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        }
-												execute: drop group privileges only after setting up namespace

If PrivateDevices=yes is set, the namespace code creates device nodes in /dev
that should be owned by the host's root, hence let's make sure we set up the
namespace before dropping group privileges.

											
										
										
											2016-08-25 17:29:12 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_sandboxing) {
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SELINUX
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
-												execute: simplify needs_sandboxing checking

Let's merge three if blocks that shall only run when sandboxing is applied
into one.

Note that this changes behaviour in one corner case: PrivateUsers=1
now honours both PermissionsStartOnly= and the "+" modifier in
ExecStart=, and not just the former, as before. This was an oversight,
so let's fix this now, at a point in time the option isn't used much
yet.

											
										
										
											2017-08-01 11:44:37 +02:00
+								                        r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_SELINUX_CONTEXT;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
-												execute: simplify needs_sandboxing checking

Let's merge three if blocks that shall only run when sandboxing is applied
into one.

Note that this changes behaviour in one corner case: PrivateUsers=1
now honours both PermissionsStartOnly= and the "+" modifier in
ExecStart=, and not just the former, as before. This was an oversight,
so let's fix this now, at a point in time the option isn't used much
yet.

											
										
										
											2017-08-01 11:44:37 +02:00
+								                        }
-												selinux: figure out selinux context applied on exec() before closing all fds

We need original socket_fd around otherwise mac_selinux_get_child_mls_label
fails with -EINVAL return code. Also don't call setexeccon twice but rather pass
context value of SELinuxContext option as an extra argument.

											
										
										
											2014-11-12 13:53:27 +01:00
+								                }
 								#endif
-												execute: simplify needs_sandboxing checking

Let's merge three if blocks that shall only run when sandboxing is applied
into one.

Note that this changes behaviour in one corner case: PrivateUsers=1
now honours both PermissionsStartOnly= and the "+" modifier in
ExecStart=, and not just the former, as before. This was an oversight,
so let's fix this now, at a point in time the option isn't used much
yet.

											
										
										
											2017-08-01 11:44:37 +02:00
+								                if (context->private_users) {
 								                        r = setup_private_users(uid, gid);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
-												execute: simplify needs_sandboxing checking

Let's merge three if blocks that shall only run when sandboxing is applied
into one.

Note that this changes behaviour in one corner case: PrivateUsers=1
now honours both PermissionsStartOnly= and the "+" modifier in
ExecStart=, and not just the former, as before. This was an oversight,
so let's fix this now, at a point in time the option isn't used much
yet.

											
										
										
											2017-08-01 11:44:37 +02:00
+								                        }
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								                }
 								        }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								         * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
 								         * however if we have it as we want to keep it open until the final execve(). */
 								        if (params->exec_fd >= 0) {
 								                exec_fd = params->exec_fd;
 								                if (exec_fd < 3 + (int) n_fds) {
 								                        int moved_fd;
 								                        /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
 								                         * process we are about to execute. */
 								                        moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
 								                        if (moved_fd < 0) {
 								                                *exit_status = EXIT_FDS;
 								                                return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
 								                        }
 								                        safe_close(exec_fd);
 								                        exec_fd = moved_fd;
 								                } else {
 								                        /* This fd should be FD_CLOEXEC already, but let's make sure. */
 								                        r = fd_cloexec(exec_fd, true);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_FDS;
 								                                return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
 								                        }
 								                }
 								                fds_with_exec_fd = newa(int, n_fds + 1);
-												core: use memcpy_safe()

Fixes #9738.

											
										
										
											2018-08-08 08:52:46 +02:00
+								                memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								                fds_with_exec_fd[n_fds] = exec_fd;
 								                n_fds_with_exec_fd = n_fds + 1;
 								        } else {
 								                fds_with_exec_fd = fds;
 								                n_fds_with_exec_fd = n_fds;
 								        }
 								        r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r >= 0)
 								                r = shift_fds(fds, n_fds);
 								        if (r >= 0)
-												core: swap order of "n_storage_fds" and "n_socket_fds" parameters

When processing fd lists to pass to activated programs we always place the
socket activation fds first, and the storage fds last. Irritatingly in
almost all calls the "n_storage_fds" parameter (i.e. the number of
storage fds to pass) came first so far, and the "n_socket_fds" parameter
second. Let's clean this up, and specify the number of fds in the order
the fds themselves are passed.

(Also, let's fix one more case where "unsigned" was used to size an
array, while we should use "size_t" instead.)

											
										
										
											2018-07-05 09:56:54 +02:00
+								                r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_FDS;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								        /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
 								         * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
 								         * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
 								         * came this far. */
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        secure_bits = context->secure_bits;
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_sandboxing) {
 								                uint64_t bset;
-												rlimit-util: introduce setrlimit_closest_all()

This new call applies all configured resource limits in one.

											
										
										
											2018-05-03 19:13:27 +02:00
+								                int which_failed;
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
-												rlimit-util: introduce setrlimit_closest_all()

This new call applies all configured resource limits in one.

											
										
										
											2018-05-03 19:13:27 +02:00
+								                r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
 								                if (r < 0) {
 								                        *exit_status = EXIT_LIMITS;
 								                        return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								                }
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which may otherwise be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								                /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
 								                if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
 								                        if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
 								                                *exit_status = EXIT_LIMITS;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which may otherwise be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								                        }
 								                }
-												core/exec: Restore SmackProcessLabel setting (#7378)

Smack LSM needs the capability CAP_MAC_ADMIN to allow
setting of the current Smack exec label. Consequently,
dropping capabilities must be done after changing the
current exec label.

This is only related to Smack LSM. But for clarity and
regularity, all setting of security context moved before
dropping capabilities.

See Issue 7108
											
										
										
											2017-11-21 12:01:13 +01:00
+								#if ENABLE_SMACK
 								                /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
 								                 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
 								                if (use_smack) {
 								                        r = setup_smack(context, command);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
 								                                return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
 								                        }
 								                }
 								#endif
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                bset = context->capability_bounding_set;
 								                /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
 								                 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
 								                 * instead of us doing that */
 								                if (needs_ambient_hack)
 								                        bset |= (UINT64_C(1) << CAP_SETPCAP) |
 								                                (UINT64_C(1) << CAP_SETUID) |
 								                                (UINT64_C(1) << CAP_SETGID);
 								                if (!cap_test_all(bset)) {
 								                        r = capability_bounding_set_drop(bset, false);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        if (r < 0) {
 								                                *exit_status = EXIT_CAPABILITIES;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
-												execute: do initgroups() first, pam initialization second so that it can still modify the groups list

											
										
										
											2011-06-30 02:15:01 +02:00
+								                        }
-												execute: log errors from "sd(EXEC)"

To give the administrator more hints about failures occurring in spawning
of commands than just the exit code, log the strerror.
All fds are closed, so reopen the log.

Related-to: https://bugzilla.redhat.com/show_bug.cgi?id=752901

											
										
										
											2011-11-17 00:21:16 +01:00
+								                }
-												execute: do initgroups() first, pam initialization second so that it can still modify the groups list

											
										
										
											2011-06-30 02:15:01 +02:00
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                /* This is done before enforce_user, but ambient set
 								                 * does not survive over setresuid() if keep_caps is not set. */
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                if (!needs_ambient_hack &&
 								                    context->capability_ambient_set != 0) {
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                        r = capability_ambient_set_apply(context->capability_ambient_set, true);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_CAPABILITIES;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                        }
 								                }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        }
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_setuid) {
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                if (context->user) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        r = enforce_user(context, uid);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                        }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
 								                        if (!needs_ambient_hack &&
 								                            context->capability_ambient_set != 0) {
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
 								                                /* Fix the ambient capabilities after user change. */
 								                                r = capability_ambient_set_apply(context->capability_ambient_set, false);
 								                                if (r < 0) {
 								                                        *exit_status = EXIT_CAPABILITIES;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                        return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                                }
 								                                /* If we were asked to change user and ambient capabilities
 								                                 * were requested, we had to add keep-caps to the securebits
 								                                 * so that we would maintain the inherited capability set
 								                                 * through the setresuid(). Make sure that the bit is added
 								                                 * also to the context secure_bits so that we don't try to
 								                                 * drop the bit away next. */
-												tree-wide: indentation fixes

											
										
										
											2016-02-25 00:27:56 +01:00
+								                                secure_bits |= 1<<SECURE_KEEP_CAPS;
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                        }
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_sandboxing) {
-												core/exec: Restore SmackProcessLabel setting (#7378)

Smack LSM needs the capability CAP_MAC_ADMIN to allow
setting of the current Smack exec label. Consequently,
dropping capabilities must be done after changing the
current exec label.

This is only related to Smack LSM. But for clarity and
regularity, all setting of security context moved before
dropping capabilities.

See Issue 7108
											
										
										
											2017-11-21 12:01:13 +01:00
+								                /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
 								                 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
 								                 * are restricted. */
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SELINUX
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                if (use_selinux) {
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                        char *exec_context = mac_selinux_context_net ?: context->selinux_context;
 								                        if (exec_context) {
 								                                r = setexeccon(exec_context);
 								                                if (r < 0) {
 								                                        *exit_status = EXIT_SELINUX_CONTEXT;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                        return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                                }
 								                        }
 								                }
 								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_APPARMOR
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                if (use_apparmor && context->apparmor_profile) {
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                        r = aa_change_onexec(context->apparmor_profile);
 								                        if (r < 0 && !context->apparmor_profile_ignore) {
 								                                *exit_status = EXIT_APPARMOR_PROFILE;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                        }
 								                }
 								#endif
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
 								                 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                if (prctl(PR_GET_SECUREBITS) != secure_bits)
 								                        if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here make sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                                *exit_status = EXIT_SECUREBITS;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
-												exec: introduce PrivateNetwork= process option to turn off network access to specific services

											
										
										
											2011-08-02 05:24:58 +02:00
+								                        }
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                if (context_has_no_new_privileges(context))
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here make sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                                *exit_status = EXIT_NO_NEW_PRIVILEGES;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                        }
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_address_families(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_ADDRESS_FAMILIES;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
-												execute: log errors from "sd(EXEC)"

To give the administrator more hints about failures occurring in spawning
of commands than just the exit code, log the strerror.
All fds are closed, so reopen the log.

Related-to: https://bugzilla.redhat.com/show_bug.cgi?id=752901

											
										
										
											2011-11-17 00:21:16 +01:00
+								                }
-												execute: setup namespace after doing NSS calls

											
										
										
											2010-06-16 16:39:28 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_memory_deny_write_execute(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								                }
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_restrict_realtime(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								                }
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								                r = apply_restrict_namespaces(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								                }
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_protect_sysctl(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
-												core:sandbox: Add ProtectKernelModules= option

This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature which is a system wide operation.

											
										
										
											2016-10-12 13:31:21 +02:00
+								                }
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_protect_kernel_modules(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                }
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_private_devices(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                }
 								                r = apply_syscall_archs(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
-												execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.

											
										
										
											2016-08-26 16:39:04 +02:00
+								                }
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
+								                r = apply_lock_personality(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
+								                }
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                /* This really should remain the last step before the execve(), to make sure our own code is unaffected
 								                 * by the filter as little as possible. */
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                r = apply_syscall_filter(unit, context, needs_ambient_hack);
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                }
 								#endif
 								        }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
+								        if (!strv_isempty(context->unset_environment)) {
 								                char **ee = NULL;
 								                ee = strv_env_delete(accum_env, 1, context->unset_environment);
 								                if (!ee) {
 								                        *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_oom();
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
+								                }
-												tree-wide: use strv_free_and_replace() macro

											
										
										
											2018-05-09 17:34:46 +02:00
+								                strv_free_and_replace(accum_env, ee);
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
+								        }
-												core: drop "argv" field from ExecParameter structure

We always initialize it from the same field in ExecCommand anyway, hence
there's no point in passing it separately to exec_spawn(), after all we
already pass the ExecCommand structure itself anyway.

No change in behaviour.

											
										
										
											2018-07-17 18:47:32 +02:00
+								        final_argv = replace_env_argv(command->argv, accum_env);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (!final_argv) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_oom();
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												tree-wide: add DEBUG_LOGGING macro that checks whether debug logging is on (#7645)

This makes things a bit easier to read I think, and also makes sure we
always use the _unlikely_ wrapper around it, which so far we used
sometimes and other times we didn't. Let's clean that up.
											
										
										
											2017-12-15 11:09:00 +01:00
+								        if (DEBUG_LOGGING) {
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                _cleanup_free_ char *line;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                line = exec_command_line(final_argv);
-												basic/log: add the log_struct terminator to macro

This way all callers do not need to specify it.
Exhaustively tested by running test-log under valgrind ;)

											
										
										
											2018-06-04 12:59:22 +02:00
+								                if (line)
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        log_struct(LOG_DEBUG,
 								                                   "EXECUTABLE=%s", command->path,
 								                                   LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
-												tree-wide: mark log_struct with _printf_ and fix fallout

log_struct takes multiple format strings, each one followed by arguments.
The _printf_ annotation is not sufficiently flexible to express this,
but we can still annotate the first format string, though not its
arguments (because their number is unknown).

With the annotation, the places which specified the message id or similar
as the first pattern cause a warning from -Wformat-nonliteral. This can
be trivially fixed by putting the MESSAGE= first.

This change will help find issues where a non-literal is erroneously used
as the pattern.

											
										
										
											2017-04-20 20:15:28 +02:00
+								                                   LOG_UNIT_ID(unit),
-												basic/log: add the log_struct terminator to macro

This way all callers do not need to specify it.
Exhaustively tested by running test-log under valgrind ;)

											
										
										
											2018-06-04 12:59:22 +02:00
+								                                   LOG_UNIT_INVOCATION_ID(unit));
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												core: when we cannot add PID to a scope cgroup, log about it

Also, place the scope unit in failed state.

											
										
										
											2015-04-28 12:20:29 +02:00
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								        if (exec_fd >= 0) {
 								                uint8_t hot = 1;
 								                /* We have finished with all our initializations. Let's now let the manager know that. From this point
 								                 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
 								                if (write(exec_fd, &hot, sizeof(hot)) < 0) {
 								                        *exit_status = EXIT_EXEC;
 								                        return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
 								                }
 								        }
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        execve(command->path, final_argv, accum_env);
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								        r = -errno;
 								        if (exec_fd >= 0) {
 								                uint8_t hot = 0;
 								                /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
 								                 * that POLLHUP on it no longer means execve() succeeded. */
 								                if (write(exec_fd, &hot, sizeof(hot)) < 0) {
 								                        *exit_status = EXIT_EXEC;
 								                        return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
 								                }
 								        }
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								        if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
 								                log_struct_errno(LOG_INFO, r,
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
 								                                 LOG_UNIT_ID(unit),
 								                                 LOG_UNIT_INVOCATION_ID(unit),
 								                                 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
 								                                                  command->path),
-												basic/log: add the log_struct terminator to macro

This way all callers do not need to specify it.
Exhaustively tested by running test-log under valgrind ;)

											
										
										
											2018-06-04 12:59:22 +02:00
+								                                 "EXECUTABLE=%s", command->path);
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return 0;
 								        }
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        *exit_status = EXIT_EXEC;
-												core: introduce new Type=exec service type

Users are often surprised that "systemd-run" command lines like
"systemd-run -p User=idontexist /bin/true" will return successfully,
even though the logs show that the process couldn't be invoked, as the
user "idontexist" doesn't exist. This is because Type=simple will only
wait until fork() succeeded before returning start-up success.

This patch adds a new service type Type=exec, which is very similar to
Type=simple, but waits until the child process completed the execve()
before returning success. It uses a pipe that has O_CLOEXEC set for this
logic, so that the kernel automatically sends POLLHUP on it when the
execve() succeeded but leaves the pipe open if not. This means PID 1
waits exactly until the execve() succeeded in the child, and not longer
and not shorter, which is the desired functionality.

Making use of this new functionality, the command line
"systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail,
as expected.

											
										
										
											2018-07-17 11:47:14 +02:00
+								        return log_unit_error_errno(unit, r, "Failed to execute command: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								}
-												execute: implement privilege dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
 								static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								int exec_spawn(Unit *unit,
 								               ExecCommand *command,
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								               const ExecContext *context,
 								               const ExecParameters *params,
 								               ExecRuntime *runtime,
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								               DynamicCreds *dcreds,
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								               pid_t *ret) {
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												core: drop "argv" field from ExecParameter structure

We always initialize it from the same field in ExecCommand anyway, hence
there's no point in passing it separately to exec_spawn(), after all we
already pass the ExecCommand structure itself anyway.

No change in behaviour.

											
										
										
											2018-07-17 18:47:32 +02:00
+								        int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        _cleanup_strv_free_ char **files_env = NULL;
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								        size_t n_storage_fds = 0, n_socket_fds = 0;
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        _cleanup_free_ char *line = NULL;
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        pid_t pid;
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        assert(unit);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        assert(command);
 								        assert(context);
 								        assert(ret);
 								        assert(params);
-												core: swap order of "n_storage_fds" and "n_socket_fds" parameters

When processing fd lists to pass to activated programs we always place the
socket activation fds first, and the storage fds last. Irritatingly in
almost all calls the "n_storage_fds" parameter (i.e. the number of
storage fds to pass) came first so far, and the "n_socket_fds" parameter
second. Let's clean this up, and specify the number of fds in the order
the fds themselves are passed.

(Also, let's fix one more case where "unsigned" was used to size an
array, while we should use "size_t" instead.)

											
										
										
											2018-07-05 09:56:54 +02:00
+								        assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
-												core: add new RestrictAddressFamilies= switch

This new unit settings allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->std_input == EXEC_INPUT_SOCKET ||
 								            context->std_output == EXEC_OUTPUT_SOCKET ||
 								            context->std_error == EXEC_OUTPUT_SOCKET) {
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								                if (params->n_socket_fds > 1) {
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        log_unit_error(unit, "Got more than one socket.");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                        return -EINVAL;
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                }
-												core: Add AppArmor profile switching

This permits switching to a specific apparmor profile when starting a daemon. This
will be a no-op if apparmor is disabled.
It also adds a new build requirement on libapparmor for using this feature.

											
										
										
											2014-02-20 16:19:44 +01:00
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								                if (params->n_socket_fds == 0) {
-												execute: Properly log errors considering socket fds (#5910)

Until now, if params->n_fds was 0, systemd was logging that there was
more than one socket.

Thanks @gregoryp and @VFXcode who did the most work debugging this.
											
										
										
											2017-05-09 01:09:22 +02:00
+								                        log_unit_error(unit, "Got no socket.");
 								                        return -EINVAL;
 								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                socket_fd = params->fds[0];
 								        } else {
 								                socket_fd = -1;
 								                fds = params->fds;
-												core: only apply NonBlocking= to fds passed via socket activation

Make sure to only apply the O_NONBLOCK flag to the fds passed via socket
activation.

Previously the flag was also applied to the fds which came from the fd store
but this was incorrect since services, after being restarted, expect that these
passed fds have their flags unchanged and can be reused as before.

The documentation was a bit unclear about this so clarify it.

											
										
										
											2017-05-12 11:32:53 +02:00
+								                n_socket_fds = params->n_socket_fds;
-												core: swap order of "n_storage_fds" and "n_socket_fds" parameters

When processing fd lists to pass to activated programs we always place the
socket activation fds first, and the storage fds last. Irritatingly in
almost all calls the "n_storage_fds" parameter (i.e. the number of
storage fds to pass) came first so far, and the "n_socket_fds" parameter
second. Let's clean this up, and specify the number of fds in the order
the fds themselves are passed.

(Also, let's fix one more case where "unsigned" was used to size an
array, while we should use "size_t" instead.)

											
										
										
											2018-07-05 09:56:54 +02:00
+								                n_storage_fds = params->n_storage_fds;
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								        r = exec_context_named_iofds(context, params, named_iofds);
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        if (r < 0)
 								                return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        r = exec_context_load_environment(unit, context, &files_env);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0)
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core: drop "argv" field from ExecParameter structure

We always initialize it from the same field in ExecCommand anyway, hence
there's no point in passing it separately to exec_spawn(), after all we
already pass the ExecCommand structure itself anyway.

No change in behaviour.

											
										
										
											2018-07-17 18:47:32 +02:00
+								        line = exec_command_line(command->argv);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (!line)
 								                return log_oom();
-												execute: support minimal environment variable replacement when executing processes

											
										
										
											2010-07-08 04:09:59 +02:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similarly, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        log_struct(LOG_DEBUG,
 								                   LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
 								                   "EXECUTABLE=%s", command->path,
-												tree-wide: mark log_struct with _printf_ and fix fallout

log_struct takes multiple format strings, each one followed by arguments.
The _printf_ annotation is not sufficiently flexible to express this,
but we can still annotate the first format string, though not its
arguments (because their number is unknown).

With the annotation, the places which specified the message id or similar
as the first pattern cause a warning from -Wformat-nonliteral. This can
be trivially fixed by putting the MESSAGE= first.

This change will help find issues where a non-literal is erroneously used
as the pattern.

											
										
										
											2017-04-20 20:15:28 +02:00
+								                   LOG_UNIT_ID(unit),
-												basic/log: add the log_struct terminator to macro

This way all callers do not need to specify it.
Exhaustively tested by running test-log under valgrind ;)

											
										
										
											2018-06-04 12:59:22 +02:00
+								                   LOG_UNIT_INVOCATION_ID(unit));
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        pid = fork();
 								        if (pid < 0)
-												core:execute: fix fork() fail handling in exec_spawn()

    If pid < 0 after fork(), 0 is always returned because r =
    exec_context_load_environment() has exited successfully.

    This will make the caller of exec_spawn() not able to handle
    the fork() error case and make systemd abort assert() possibly.

											
										
										
											2015-11-26 04:46:40 +01:00
+								                return log_unit_error_errno(unit, errno, "Failed to fork: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
 								        if (pid == 0) {
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                int exit_status = EXIT_SUCCESS;
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similarly, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                r = exec_child(unit,
 								                               command,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                               context,
 								                               params,
 								                               runtime,
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                               dcreds,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                               socket_fd,
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                               named_iofds,
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								                               fds,
-												core: only apply NonBlocking= to fds passed via socket activation

Make sure to only apply the O_NONBLOCK flag to the fds passed via socket
activation.

Previously the flag was also applied to the fds which came from the fd store
but this was incorrect since services, after being restarted, expect that these
passed fds have their flags unchanged and can be reused as before.

The documentation was a bit unclear about this so clarify it.

											
										
										
											2017-05-12 11:32:53 +02:00
+								                               n_socket_fds,
-												core: swap order of "n_storage_fds" and "n_socket_fds" parameters

When processing fd lists to pass to activated programs we always place the
socket activation fds first, and the storage fds last. Irritatingly in
almost all calls the "n_storage_fds" parameter (i.e. the number of
storage fds to pass) came first so far, and the "n_socket_fds" parameter
second. Let's clean this up, and specify the number of fds in the order
the fds themselves are passed.

(Also, let's fix one more case where "unsigned" was used to size an
array, while we should use "size_t" instead.)

											
										
										
											2018-07-05 09:56:54 +02:00
+								                               n_storage_fds,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                               files_env,
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e.  all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								                               unit->manager->user_lookup_fds[1],
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                               &exit_status);
-												basic/log: add the log_struct terminator to macro

This way all callers do not need to specify it.
Exhaustively tested by running test-log under valgrind ;)

											
										
										
											2018-06-04 12:59:22 +02:00
+								                if (r < 0)
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        log_struct_errno(LOG_ERR, r,
 								                                         "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
 								                                         LOG_UNIT_ID(unit),
 								                                         LOG_UNIT_INVOCATION_ID(unit),
 								                                         LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
 								                                                          exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
 								                                                          command->path),
-												basic/log: add the log_struct terminator to macro

This way all callers do not need to specify it.
Exhaustively tested by running test-log under valgrind ;)

											
										
										
											2018-06-04 12:59:22 +02:00
+								                                         "EXECUTABLE=%s", command->path);
-												execute: log errors from "sd(EXEC)"

To give the administrator more hints about failures occurring in spawning
of commands than just the exit code, log the strerror.
All fds are closed, so reopen the log.

Related-to: https://bugzilla.redhat.com/show_bug.cgi?id=752901

											
										
										
											2011-11-17 00:21:16 +01:00
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                _exit(exit_status);
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        }
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similarly, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
-												systemd: use structured logging for unit changes

Information which unit a log entry pertains to enables systemctl
status to display more log messages.

											
										
										
											2012-10-11 00:11:24 +02:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        /* We add the new process to the cgroup both in the child (so
 								         * that we can be sure that no user code is ever executed
 								         * outside of the cgroup) and in the parent (so that we can be
 								         * sure that when we kill the cgroup the process will be
 								         * killed too). */
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (params->cgroup_path)
-												core: when we cannot add PID to a scope cgroup, log about it

Also, place the scope unit in failed state.

											
										
										
											2015-04-28 12:20:29 +02:00
+								                (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
-												fix an assert when forking

											
										
										
											2010-01-27 05:30:58 +01:00
-												dbus: complete exec status coverage

											
										
										
											2010-07-04 18:49:58 +02:00
+								        exec_status_start(&command->exec_status, pid);
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        *ret = pid;
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								        return 0;
 								}
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								void exec_context_init(ExecContext *c) {
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        ExecDirectoryType i;
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        assert(c);
-												umask: change default umask to 0022 just to be sure, and set it explicitly in all binaries, in order to make sure it is set when started from the terminal

											
										
										
											2011-08-01 20:52:18 +02:00
+								        c->umask = 0022;
-												support chrooting/setting of ioprio when spawning

											
										
										
											2010-01-29 20:46:22 +01:00
+								        c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        c->cpu_sched_policy = SCHED_OTHER;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
+								        c->syslog_priority = LOG_DAEMON|LOG_INFO;
-												turn negative options into positive options

											
										
										
											2010-07-05 01:08:13 +02:00
+								        c->syslog_level_prefix = true;
-												service: ignore SIGPIPE by default

											
										
										
											2012-02-09 03:18:04 +01:00
+								        c->ignore_sigpipe = true;
-												time-util: add and use USEC/NSEC_INFINIY

											
										
										
											2014-07-29 12:23:31 +02:00
+								        c->timer_slack_nsec = NSEC_INFINITY;
-												util: introduce PERSONALITY_INVALID as macro for 0xffffffffLU

											
										
										
											2015-05-21 19:48:49 +02:00
+								        c->personality = PERSONALITY_INVALID;
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                c->directories[i].mode = 0755;
-												capabilities: keep bounding set in non-inverted format.

Change the capability bounding set parser and logic so that the bounding
set is kept as a positive set internally. This means that the set
reflects those capabilities that we want to keep instead of drop.

											
										
										
											2016-01-07 23:00:04 +01:00
+								        c->capability_bounding_set = CAP_ALL;
-												load-fragment: allow to specify RestrictNamespaces= multiple times

If multiple RestrictNamespaces= settings are set, then merge the settings.
This also drops supporting "~yes" and "~no".

											
										
										
											2018-05-01 03:36:39 +02:00
+								        assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
 								        c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
-												core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald

And let's make use of it to implement two new unit settings with it:

1. LogLevelMax= is a new per-unit setting that may be used to configure
   log priority filtering: set it to LogLevelMax=notice and only
   messages of level "notice" and lower (i.e. more important) will be
   processed, all others are dropped.

2. LogExtraFields= is a new per-unit setting for configuring per-unit
   journal fields, that are implicitly included in every log record
   generated by the unit's processes. It takes field/value pairs in the
   form of FOO=BAR.

Also, related to this, one existing unit setting is ported to this new
facility:

3. The invocation ID is now pulled from /run/systemd/units/ instead of
   cgroupfs xattrs. This substantially relaxes requirements of systemd
   on the kernel version and the privileges it runs with (specifically,
   cgroupfs xattrs are not available in containers, since they are
   stored in kernel memory, and hence are unsafe to permit to lesser
   privileged code).

/run/systemd/units/ is a new directory, which contains a number of files
and symlinks encoding the above information. PID 1 creates and manages
these files, and journald reads them from there.

Note that this is supposed to be a direct path between PID 1 and the
journal only, due to the special runtime environment the journal runs
in. Normally, today we shouldn't introduce new interfaces that (mis-)use
a file system as IPC framework, and instead just use an IPC system, but this
is very hard to do between the journal and PID 1, as long as the IPC
system is a subject PID 1 manages, and itself a client to the journal.

This patch cleans up a couple of types used in journal code:
specifically we switch to size_t for a couple of memory-sizing values,
as size_t is the right choice for everything that is memory.

Fixes: #4089
Fixes: #3041
Fixes: #4441

											
										
										
											2017-11-02 19:43:32 +01:00
+								        c->log_level_max = -1;
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								}
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								void exec_context_done(ExecContext *c) {
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        ExecDirectoryType i;
-												core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald

And let's make use of it to implement two new unit settings with it:

1. LogLevelMax= is a new per-unit setting that may be used to configure
   log priority filtering: set it to LogLevelMax=notice and only
   messages of level "notice" and lower (i.e. more important) will be
   processed, all others are dropped.

2. LogExtraFields= is a new per-unit setting for configuring per-unit
   journal fields, that are implicitly included in every log record
   generated by the unit's processes. It takes field/value pairs in the
   form of FOO=BAR.

Also, related to this, one existing unit setting is ported to this new
facility:

3. The invocation ID is now pulled from /run/systemd/units/ instead of
   cgroupfs xattrs. This substantially relaxes requirements of systemd
   on the kernel version and the privileges it runs with (specifically,
   cgroupfs xattrs are not available in containers, since they are
   stored in kernel memory, and hence are unsafe to permit to lesser
   privileged code).

/run/systemd/units/ is a new directory, which contains a number of files
and symlinks encoding the above information. PID 1 creates and manages
these files, and journald reads them from there.

Note that this is supposed to be a direct path between PID 1 and the
journal only, due to the special runtime environment the journal runs
in. Normally, today we shouldn't introduce new interfaces that (mis-)use
a file system as IPC framework, and instead just use an IPC system, but this
is very hard to do between the journal and PID 1, as long as the IPC
system is a subject PID 1 manages, and itself a client to the journal.

This patch cleans up a couple of types used in journal code:
specifically we switch to size_t for a couple of memory-sizing values,
as size_t is the right choice for everything that is memory.

Fixes: #4089
Fixes: #3041
Fixes: #4441

											
										
										
											2017-11-02 19:43:32 +01:00
+								        size_t l;
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
 								        assert(c);
-												tree-wide: make use of the fact that strv_free() returns NULL

Another Coccinelle patch.

											
										
										
											2015-09-09 23:05:10 +02:00
+								        c->environment = strv_free(c->environment);
 								        c->environment_files = strv_free(c->environment_files);
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--set-env=...` options to systemd-spawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								        c->pass_environment = strv_free(c->pass_environment);
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
+								        c->unset_environment = strv_free(c->unset_environment);
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												rlimit-util: add a common destructor call for arrays of struct rlimit

											
										
										
											2018-05-03 19:05:59 +02:00
+								        rlimit_free_all(c->rlimit);
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core: add support for StandardInputFile= and friends

These new settings permit specifying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate the fd instead.

2) If an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								        for (l = 0; l < 3; l++) {
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
-												core: add support for StandardInputFile= and friends

These new settings permit specifying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate the fd instead.

2) If an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								                c->stdio_file[l] = mfree(c->stdio_file[l]);
 								        }
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								        c->working_directory = mfree(c->working_directory);
 								        c->root_directory = mfree(c->root_directory);
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        c->root_image = mfree(c->root_image);
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								        c->tty_path = mfree(c->tty_path);
 								        c->syslog_identifier = mfree(c->syslog_identifier);
 								        c->user = mfree(c->user);
 								        c->group = mfree(c->group);
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												tree-wide: make use of the fact that strv_free() returns NULL

Another Coccinelle patch.

											
										
										
											2015-09-09 23:05:10 +02:00
+								        c->supplementary_groups = strv_free(c->supplementary_groups);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								        c->pam_name = mfree(c->pam_name);
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
-												doc,core: Read{Write,Only}Paths= and InaccessiblePaths=

This patch renames Read{Write,Only}Directories= and InaccessibleDirectories=
to Read{Write,Only}Paths= and InaccessiblePaths=, previous names are kept
as aliases but they are not advertised in the documentation.

Renamed variables:
`read_write_dirs` --> `read_write_paths`
`read_only_dirs` --> `read_only_paths`
`inaccessible_dirs` --> `inaccessible_paths`

											
										
										
											2016-07-07 11:17:00 +02:00
+								        c->read_only_paths = strv_free(c->read_only_paths);
 								        c->read_write_paths = strv_free(c->read_write_paths);
 								        c->inaccessible_paths = strv_free(c->inaccessible_paths);
-												dbus: complete exec coverage

											
										
										
											2010-07-04 16:44:58 +02:00
-												core: add ability to define arbitrary bind mounts for services

This adds two new settings BindPaths= and BindReadOnlyPaths=. They allow
defining arbitrary bind mounts specific to particular services. This is
particularly useful for services with RootDirectory= set as this permits making
specific bits of the host directory available to chrooted services.

The two new settings follow the concepts nspawn already possesses in --bind= and
--bind-ro=, as well as the .nspawn settings Bind= and BindReadOnly= (and these
latter options should probably be renamed to BindPaths= and BindReadOnlyPaths=
too).

Fixes: #3439

											
										
										
											2016-11-23 22:21:40 +01:00
+								        bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
-												core/execute: clear bind_mounts

											
										
										
											2018-02-13 06:22:23 +01:00
+								        c->bind_mounts = NULL;
 								        c->n_bind_mounts = 0;
-												core: add new setting TemporaryFileSystem=

This introduces a new setting TemporaryFileSystem=. This is useful
to hide files not relevant to the processes invoked by unit, while
necessary files or directories can be still accessed by combining
with Bind{,ReadOnly}Paths=.

											
										
										
											2018-02-21 01:17:52 +01:00
+								        temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
 								        c->temporary_filesystems = NULL;
 								        c->n_temporary_filesystems = 0;
-												core: add ability to define arbitrary bind mounts for services

This adds two new settings BindPaths= and BindReadOnlyPaths=. They allow
defining arbitrary bind mounts specific to particular services. This is
particularly useful for services with RootDirectory= set as this permits making
specific bits of the host directory available to chrooted services.

The two new settings follow the concepts nspawn already possesses in --bind= and
--bind-ro=, as well as the .nspawn settings Bind= and BindReadOnly= (and these
latter options should probably be renamed to BindPaths= and BindReadOnlyPaths=
too).

Fixes: #3439

											
										
										
											2016-11-23 22:21:40 +01:00
-												tree-wide: use cpu_set_mfree()

											
										
										
											2017-11-30 15:23:16 +01:00
+								        c->cpuset = cpu_set_mfree(c->cpuset);
-												execute,util: fix two small memory leaks

											
										
										
											2011-01-06 23:52:17 +01:00
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								        c->utmp_id = mfree(c->utmp_id);
 								        c->selinux_context = mfree(c->selinux_context);
 								        c->apparmor_profile = mfree(c->apparmor_profile);
-												core: modify resource leak by SmackProcessLabel=

											
										
										
											2017-07-13 06:06:34 +02:00
+								        c->smack_process_label = mfree(c->smack_process_label);
-												core: Add AppArmor profile switching

This permits switching to a specific AppArmor profile when starting a daemon. This
will be a no-op if AppArmor is disabled.
It also adds a new build requirement on libapparmor for using this feature.

											
										
										
											2014-02-20 16:19:44 +01:00
-												core: add support to specify errno in SystemCallFilter=

This makes each system call in a SystemCallFilter= blacklist optionally
take an errno name or number after a colon. The errno takes precedence
over the one given by SystemCallErrorNumber=.

C.f. #7173.
Closes #7169.

											
										
										
											2017-11-11 13:35:49 +01:00
+								        c->syscall_filter = hashmap_free(c->syscall_filter);
-												tree-wide: take benefit of the fact that hashmap_free() returns NULL

And set_free() too.

Another Coccinelle patch.

											
										
										
											2015-09-09 23:12:07 +02:00
+								        c->syscall_archs = set_free(c->syscall_archs);
 								        c->address_families = set_free(c->address_families);
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                c->directories[i].paths = strv_free(c->directories[i].paths);
-												core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald

And let's make use of it to implement two new unit settings with it:

1. LogLevelMax= is a new per-unit setting that may be used to configure
   log priority filtering: set it to LogLevelMax=notice and only
   messages of level "notice" and lower (i.e. more important) will be
   processed, all others are dropped.

2. LogExtraFields= is a new per-unit setting for configuring per-unit
   journal fields, that are implicitly included in every log record
   generated by the unit's processes. It takes field/value pairs in the
   form of FOO=BAR.

Also, related to this, one existing unit setting is ported to this new
facility:

3. The invocation ID is now pulled from /run/systemd/units/ instead of
   cgroupfs xattrs. This substantially relaxes requirements of systemd
   on the kernel version and the privileges it runs with (specifically,
   cgroupfs xattrs are not available in containers, since they are
   stored in kernel memory, and hence are unsafe to permit to lesser
   privileged code).

/run/systemd/units/ is a new directory, which contains a number of files
and symlinks encoding the above information. PID 1 creates and manages
these files, and journald reads them from there.

Note that this is supposed to be a direct path between PID 1 and the
journal only, due to the special runtime environment the journal runs
in. Normally, today we shouldn't introduce new interfaces that (mis-)use
a file system as IPC framework, and instead just use an IPC system, but this
is very hard to do between the journal and PID 1, as long as the IPC
system is a subject PID 1 manages, and itself a client to the journal.

This patch cleans up a couple of types used in journal code:
specifically we switch to size_t for a couple of memory-sizing values,
as size_t is the right choice for everything that is memory.

Fixes: #4089
Fixes: #3041
Fixes: #4441

											
										
										
											2017-11-02 19:43:32 +01:00
 								        c->log_level_max = -1;
 								        exec_context_free_log_extra_fields(c);
-												core: add two new unit file settings: StandardInputData= + StandardInputText=

Both permit configuring data to pass through STDIN to an invoked
process. StandardInputText= accepts a line of text (possibly with
embedded C-style escapes as well as unit specifiers), which is appended
to the buffer to pass as stdin, followed by a single newline.
StandardInputData= is similar, but accepts arbitrary base64 encoded
data, and will not resolve specifiers or C-style escapes, nor append
newlines.

This may be used to pass input/configuration data to services, directly
in-line from unit files, either in a cooked or in a more raw format.

											
										
										
											2017-10-27 11:33:05 +02:00
 								        c->stdin_data = mfree(c->stdin_data);
 								        c->stdin_data_size = 0;
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								        char **i;
 								        assert(c);
 								        if (!runtime_prefix)
 								                return 0;
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								                _cleanup_free_ char *p;
-												tree-wide: drop NULL sentinel from strjoin

This makes strjoin and strjoina more similar and avoids the useless final
argument.

spatch -I . -I ./src -I ./src/basic -I ./src/basic -I ./src/shared -I ./src/shared -I ./src/network -I ./src/locale -I ./src/login -I ./src/journal -I ./src/journal -I ./src/timedate -I ./src/timesync -I ./src/nspawn -I ./src/resolve -I ./src/resolve -I ./src/systemd -I ./src/core -I ./src/core -I ./src/libudev -I ./src/udev -I ./src/udev/net -I ./src/udev -I ./src/libsystemd/sd-bus -I ./src/libsystemd/sd-event -I ./src/libsystemd/sd-login -I ./src/libsystemd/sd-netlink -I ./src/libsystemd/sd-network -I ./src/libsystemd/sd-hwdb -I ./src/libsystemd/sd-device -I ./src/libsystemd/sd-id128 -I ./src/libsystemd-network --sp-file coccinelle/strjoin.cocci --in-place $(git ls-files src/*.c)

git grep -e '\bstrjoin\b.*NULL' -l|xargs sed -i -r 's/strjoin\((.*), NULL\)/strjoin(\1)/'

This might have missed a few cases (spatch has a really hard time dealing
with _cleanup_ macros), but that's no big issue, they can always be fixed
later.

											
										
										
											2016-10-23 17:43:27 +02:00
+								                p = strjoin(runtime_prefix, "/", *i);
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								                if (!p)
 								                        return -ENOMEM;
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                /* We execute this synchronously, since we need to be sure this is gone when we start the service
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								                 * next. */
-												util: rework rm_rf() logic

- Move to its own file rm-rf.c

- Change parameters into a single flags parameter

- Remove "honour sticky" logic, it's unused these days

											
										
										
											2015-04-04 11:52:57 +02:00
+								                (void) rm_rf(p, REMOVE_ROOT);
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								        }
 								        return 0;
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								static void exec_command_done(ExecCommand *c) {
-												execute: introduce exec_command_done() to free data from static ExecCommand structs

											
										
										
											2010-04-10 17:47:07 +02:00
+								        assert(c);
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								        c->path = mfree(c->path);
-												tree-wide: make use of the fact that strv_free() returns NULL

Another Coccinelle patch.

											
										
										
											2015-09-09 23:05:10 +02:00
+								        c->argv = strv_free(c->argv);
-												execute: introduce exec_command_done() to free data from static ExecCommand structs

											
										
										
											2010-04-10 17:47:07 +02:00
+								}
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								void exec_command_done_array(ExecCommand *c, size_t n) {
 								        size_t i;
-												execute: introduce exec_command_done() to free data from static ExecCommand structs

											
										
										
											2010-04-10 17:47:07 +02:00
 								        for (i = 0; i < n; i++)
 								                exec_command_done(c+i);
 								}
-												core: make exec_command_free_list return NULL

											
										
										
											2014-12-18 18:29:24 +01:00
+								ExecCommand* exec_command_free_list(ExecCommand *c) {
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								        ExecCommand *i;
 								        while ((i = c)) {
-												list: make our list macros a bit easier to use by not requring type spec on each invocation

We can determine the list entry type via the typeof() gcc construct, and
so we should to make the macros much shorter to use.

											
										
										
											2013-10-14 06:10:14 +02:00
+								                LIST_REMOVE(command, c, i);
-												execute: introduce exec_command_done() to free data from static ExecCommand structs

											
										
										
											2010-04-10 17:47:07 +02:00
+								                exec_command_done(i);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								                free(i);
 								        }
-												core: make exec_command_free_list return NULL

											
										
										
											2014-12-18 18:29:24 +01:00
 								        return NULL;
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								}
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								void exec_command_free_array(ExecCommand **c, size_t n) {
 								        size_t i;
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core: make exec_command_free_list return NULL

											
										
										
											2014-12-18 18:29:24 +01:00
+								        for (i = 0; i < n; i++)
 								                c[i] = exec_command_free_list(c[i]);
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								}
-												core: properly reset all ExecStatus structures when entering a new unit cycle

Whenever a unit is started fresh we should flush out any runtime data
from the previous cycle. We are pretty good at that already, but what so
far we missed was the ExecStart=/ExecStop=/… command exit status data.
Let's fix that, and properly flush out that stuff too.

Consider this service:

    [Service]
    ExecStart=/bin/sleep infinity
    ExecStop=/bin/false

When this service is started, then stopped and then started again
"systemctl status" would show the ExecStop= results of the previous run
along with the ExecStart= results of the current one, which is very
confusing. With this patch this is corrected: the data is kept right
until the moment the new service cycle starts, and then flushed out.
Hence "systemctl status" in that case will only show the ExecStart=
data, but no ExecStop= data, like it should be.

This should fix part of the confusion of #9588

											
										
										
											2018-07-17 19:36:46 +02:00
+								void exec_command_reset_status_array(ExecCommand *c, size_t n) {
 								        /* Reset the exec_status member of each of the n ExecCommand
 								         * entries stored consecutively starting at c. */
 								        ExecCommand *cmd = c;
 								        size_t remaining = n;
 								        while (remaining > 0) {
 								                exec_status_reset(&cmd->exec_status);
 								                cmd++;
 								                remaining--;
 								        }
 								}
 								void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
 								        /* c holds n list heads; for every head, visit each command
 								         * reachable through LIST_FOREACH() on the "command" list
 								         * and reset its exec_status member. */
 								        size_t idx = 0;
 								        while (idx < n) {
 								                ExecCommand *cmd;
 								                LIST_FOREACH(command, cmd, c[idx])
 								                        exec_status_reset(&cmd->exec_status);
 								                idx++;
 								        }
 								}
-												env-util: don't include files from src/core/

											
										
										
											2014-12-23 19:04:56 +01:00
+								typedef struct InvalidEnvInfo {
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								        const Unit *unit;
-												env-util: don't include files from src/core/

											
										
										
											2014-12-23 19:04:56 +01:00
+								        const char *path;
 								} InvalidEnvInfo;
 								static void invalid_env(const char *p, void *userdata) {
 								        InvalidEnvInfo *info = userdata;
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only do .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
-												env-util: don't include files from src/core/

											
										
										
											2014-12-23 19:04:56 +01:00
+								}
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								const char* exec_context_fdname(const ExecContext *c, int fd_index) {
 								        assert(c);
 								        switch (fd_index) {
-												core: fold property_get_input_fdname() and property_get_output_fdname() into one

property_get_output_fdname() already had two different control flows for
stdout and stderr, it might as well handle stdin too, thus shortening
our code a bit.

											
										
										
											2017-10-27 14:57:12 +02:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        case STDIN_FILENO:
 								                if (c->std_input != EXEC_INPUT_NAMED_FD)
 								                        return NULL;
-												core: fold property_get_input_fdname() and property_get_output_fdname() into one

property_get_output_fdname() already had two different control flows for
stdout and stderr, it might as well handle stdin too, thus shortening
our code a bit.

											
										
										
											2017-10-27 14:57:12 +02:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
-												core: fold property_get_input_fdname() and property_get_output_fdname() into one

property_get_output_fdname() already had two different control flows for
stdout and stderr, it might as well handle stdin too, thus shortening
our code a bit.

											
										
										
											2017-10-27 14:57:12 +02:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        case STDOUT_FILENO:
 								                if (c->std_output != EXEC_OUTPUT_NAMED_FD)
 								                        return NULL;
-												core: fold property_get_input_fdname() and property_get_output_fdname() into one

property_get_output_fdname() already had two different control flows for
stdout and stderr, it might as well handle stdin too, thus shortening
our code a bit.

											
										
										
											2017-10-27 14:57:12 +02:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
-												core: fold property_get_input_fdname() and property_get_output_fdname() into one

property_get_output_fdname() already had two different control flows for
stdout and stderr, it might as well handle stdin too, thus shortening
our code a bit.

											
										
										
											2017-10-27 14:57:12 +02:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        case STDERR_FILENO:
 								                if (c->std_error != EXEC_OUTPUT_NAMED_FD)
 								                        return NULL;
-												core: fold property_get_input_fdname() and property_get_output_fdname() into one

property_get_output_fdname() already had two different control flows for
stdout and stderr, it might as well handle stdin too, thus shortening
our code a bit.

											
										
										
											2017-10-27 14:57:12 +02:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
-												core: fold property_get_input_fdname() and property_get_output_fdname() into one

property_get_output_fdname() already had two different control flows for
stdout and stderr, it might as well handle stdin too, thus shortening
our code a bit.

											
										
										
											2017-10-27 14:57:12 +02:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        default:
 								                return NULL;
 								        }
 								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								        size_t i, targets;
-												core/execute: reformat exec_context_named_iofds() for legibility

											
										
										
											2017-01-31 17:23:10 +01:00
+								        const char* stdio_fdname[3];
-												tree-wide: be more careful with the type of array sizes

Previously we were a bit sloppy with the index and size types of arrays,
we'd regularly use unsigned. While I don't think this ever resulted in
real issues I think we should be more careful there and follow a
stricter regime: unless there's a strong reason not to use size_t for
array sizes and indexes, size_t it should be. Any allocations we do
ultimately will use size_t anyway, and converting forth and back between
unsigned and size_t will always be a source of problems.

Note that on 32bit machines "unsigned" and "size_t" are equivalent, and
on 64bit machines our arrays shouldn't grow that large anyway, and if
they do we have a problem, however that kind of overly large allocation
we have protections for usually, but for overflows we do not have that
so much, hence let's add it.

So yeah, it's a story of the current code being already "good enough",
but I think some extra type hygiene is better.

This patch tries to be comprehensive, but it probably isn't and I missed
a few cases. But I guess we can cover that later as we notice it. Among
smaller fixes, this changes:

1. strv_length()'s return type becomes size_t

2. the unit file changes array size becomes size_t

3. DNS answer and query array sizes become size_t

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=76745

											
										
										
											2018-04-27 14:09:31 +02:00
+								        size_t n_fds;
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
 								        assert(c);
 								        assert(p);
 								        targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
 								                  (c->std_output == EXEC_OUTPUT_NAMED_FD) +
 								                  (c->std_error == EXEC_OUTPUT_NAMED_FD);
 								        for (i = 0; i < 3; i++)
 								                stdio_fdname[i] = exec_context_fdname(c, i);
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								        n_fds = p->n_storage_fds + p->n_socket_fds;
 								        for (i = 0; i < n_fds  && targets > 0; i++)
-												core/execute: reformat exec_context_named_iofds() for legibility

											
										
										
											2017-01-31 17:23:10 +01:00
+								                if (named_iofds[STDIN_FILENO] < 0 &&
 								                    c->std_input == EXEC_INPUT_NAMED_FD &&
 								                    stdio_fdname[STDIN_FILENO] &&
 								                    streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                        named_iofds[STDIN_FILENO] = p->fds[i];
 								                        targets--;
-												core/execute: reformat exec_context_named_iofds() for legibility

											
										
										
											2017-01-31 17:23:10 +01:00
 								                } else if (named_iofds[STDOUT_FILENO] < 0 &&
 								                           c->std_output == EXEC_OUTPUT_NAMED_FD &&
 								                           stdio_fdname[STDOUT_FILENO] &&
 								                           streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                        named_iofds[STDOUT_FILENO] = p->fds[i];
 								                        targets--;
-												core/execute: reformat exec_context_named_iofds() for legibility

											
										
										
											2017-01-31 17:23:10 +01:00
 								                } else if (named_iofds[STDERR_FILENO] < 0 &&
 								                           c->std_error == EXEC_OUTPUT_NAMED_FD &&
 								                           stdio_fdname[STDERR_FILENO] &&
 								                           streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                        named_iofds[STDERR_FILENO] = p->fds[i];
 								                        targets--;
 								                }
-												core/execute: reformat exec_context_named_iofds() for legibility

											
										
										
											2017-01-31 17:23:10 +01:00
+								        return targets == 0 ? 0 : -ENOENT;
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								        char **i, **r = NULL;
 								        assert(c);
 								        assert(l);
 								        STRV_FOREACH(i, c->environment_files) {
 								                char *fn;
-												core: fix warning about unsigned variable (#5935)

Fixup for d8c92e8bc7351f553936b5235e1922c18ebd817a.
											
										
										
											2017-05-11 08:15:28 +02:00
+								                int k;
 								                unsigned n;
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								                bool ignore = false;
 								                char **p;
-												move _cleanup_ attribute in front of the type

http://lists.freedesktop.org/archives/systemd-devel/2013-April/010510.html

											
										
										
											2013-04-18 09:11:22 +02:00
+								                _cleanup_globfree_ glob_t pglob = {};
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
 								                fn = *i;
 								                if (fn[0] == '-') {
 								                        ignore = true;
-												tree-wide: make ++/-- usage consistent WRT spacing

Throughout the tree there's spurious use of spaces separating ++ and --
operators from their respective operands.  Make ++ and -- operator
consistent with the majority of existing uses; discard the spaces.

											
										
										
											2016-02-23 05:32:04 +01:00
+								                        fn++;
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								                }
 								                if (!path_is_absolute(fn)) {
 								                        if (ignore)
 								                                continue;
 								                        strv_free(r);
 								                        return -EINVAL;
 								                }
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                /* Filename supports globbing, take all matching files */
-												execute: filter out "." and ".." in EnvironmentFile= globs too

This doesn't really matter much, only in case somebody would use
something strange like

  EnvironmentFile=/etc/something/.*

Make sure that "." and ".." is not returned by that glob. This makes
all our globbing patterns behave the same.

											
										
										
											2017-04-26 04:54:50 +02:00
+								                k = safe_glob(fn, 0, &pglob);
 								                if (k < 0) {
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                        if (ignore)
 								                                continue;
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                        strv_free(r);
-												execute: filter out "." and ".." in EnvironmentFile= globs too

This doesn't really matter much, only in case somebody would use
something strange like

  EnvironmentFile=/etc/something/.*

Make sure that "." and ".." is not returned by that glob. This makes
all our globbing patterns behave the same.

											
										
										
											2017-04-26 04:54:50 +02:00
+								                        return k;
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                }
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												execute: filter out "." and ".." in EnvironmentFile= globs too

This doesn't really matter much, only in case somebody would use
something strange like

  EnvironmentFile=/etc/something/.*

Make sure that "." and ".." is not returned by that glob. This makes
all our globbing patterns behave the same.

											
										
										
											2017-04-26 04:54:50 +02:00
+								                /* When we don't match anything, -ENOENT should be returned */
 								                assert(pglob.gl_pathc > 0);
 								                for (n = 0; n < pglob.gl_pathc; n++) {
-												machinectl: show /etc/os-release information of container in status output

											
										
										
											2014-07-03 17:50:55 +02:00
+								                        k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                        if (k < 0) {
 								                                if (ignore)
 								                                        continue;
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                                strv_free(r);
 								                                return k;
-												replace tabs with spaces in various files

The affected files in this patch had inconsistent use of tabs vs. spaces
for indentation, and this patch eliminates the stray tabs.

Also, the opening brace of sigchld_hdl() in activate.c was moved so the
opening braces are consistent throughout the file.

											
										
										
											2013-07-02 13:24:48 +02:00
+								                        }
-												core/execute: report invalid environment variables from files

Because "export key=val" is not supported by systemd, an error is logged
where the invalid assignment is coming from.

Introduce strv_env_clean_log() to log invalid environment assignments,
where logging is possible and allowed.

parse_env_file_internal() is modified to allow WHITESPACE in keys, to
report the issues later on.

											
										
										
											2013-04-17 15:25:02 +02:00
+								                        /* Log invalid environment variables with filename */
-												env-util: don't include files from src/core/

											
										
										
											2014-12-23 19:04:56 +01:00
+								                        if (p) {
 								                                InvalidEnvInfo info = {
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                                        .unit = unit,
-												env-util: don't include files from src/core/

											
										
										
											2014-12-23 19:04:56 +01:00
+								                                        .path = pglob.gl_pathv[n]
 								                                };
 								                                p = strv_env_clean_with_callback(p, invalid_env, &info);
 								                        }
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												tree-wide: drop a few == NULL and != NULL comparison

Our CODING_STYLE suggests not comparing with NULL, but relying on C's
downgrade-to-bool feature for that. Fix up some code to match these
guidelines. (This is not comprehensive, the coccinelle output for this
is unfortunately kinda borked)

											
										
										
											2017-12-08 20:52:38 +01:00
+								                        if (!r)
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                                r = p;
 								                        else {
 								                                char **m;
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                                m = strv_env_merge(2, r, p);
 								                                strv_free(r);
 								                                strv_free(p);
-												Add _cleanup_globfree_

Fixes a memleak in error path in exec_context_load_environment.

											
										
										
											2013-03-25 00:09:19 +01:00
+								                                if (!m)
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                                        return -ENOMEM;
 								                                r = m;
 								                        }
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								                }
 								        }
 								        *l = r;
 								        return 0;
 								}
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
+								static bool tty_may_match_dev_console(const char *tty) {
-												terminal-util: make resolve_dev_console() less weird

Let's normalize the behaviour: return a negative errno style error code,
and return the resolved string directly as argument.

											
										
										
											2018-02-14 17:30:37 +01:00
+								        _cleanup_free_ char *resolved = NULL;
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        if (!tty)
 								                return true;
-												util-lib: add a new skip_dev_prefix() helper

This new helper removes a leading /dev if there is one. We have code
doing this all over the place, let's unify this, and correct it while
we are at it, by using path_startswith() rather than startswith() to
drop the prefix.

											
										
										
											2017-08-09 19:01:18 +02:00
+								        tty = skip_dev_prefix(tty);
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
 								        /* trivial identity? */
 								        if (streq(tty, "console"))
 								                return true;
-												terminal-util: make resolve_dev_console() less weird

Let's normalize the behaviour: return a negative errno style error code,
and return the resolved string directly as argument.

											
										
										
											2018-02-14 17:30:37 +01:00
+								        if (resolve_dev_console(&resolved) < 0)
 								                return true; /* if we could not resolve, assume it may */
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
 								        /* "tty0" means the active VC, so it may be the same sometimes */
-												terminal-util: make resolve_dev_console() less weird

Let's normalize the behaviour: return a negative errno style error code,
and return the resolved string directly as argument.

											
										
										
											2018-02-14 17:30:37 +01:00
+								        return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
+								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								bool exec_context_may_touch_console(const ExecContext *ec) {
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
 								        return (ec->tty_reset ||
 								                ec->tty_vhangup ||
 								                ec->tty_vt_disallocate ||
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
+								                is_terminal_input(ec->std_input) ||
 								                is_terminal_output(ec->std_output) ||
 								                is_terminal_output(ec->std_error)) &&
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								               tty_may_match_dev_console(exec_context_tty_path(ec));
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
+								}
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								static void strv_fprintf(FILE *f, char **l) {
 								        /* Writes every entry of the string vector l to f, each one preceded by a
 								         * single space; no newline or other terminator is emitted. Iteration is
 								         * delegated to STRV_FOREACH, which is defined outside this file section. */
 								        char **g;
 								        assert(f);
 								        STRV_FOREACH(g, l)
 								                fprintf(f, " %s", *g);
 								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
-												core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald

And let's make use of it to implement two new unit settings with it:

1. LogLevelMax= is a new per-unit setting that may be used to configure
   log priority filtering: set it to LogLevelMax=notice and only
   messages of level "notice" and lower (i.e. more important) will be
   processed, all others are dropped.

2. LogExtraFields= is a new per-unit setting for configuring per-unit
   journal fields, that are implicitly included in every log record
   generated by the unit's processes. It takes field/value pairs in the
   form of FOO=BAR.

Also, related to this, one existing unit setting is ported to this new
facility:

3. The invocation ID is now pulled from /run/systemd/units/ instead of
   cgroupfs xattrs. This substantially relaxes requirements of systemd
   on the kernel version and the privileges it runs with (specifically,
   cgroupfs xattrs are not available in containers, since they are
   stored in kernel memory, and hence are unsafe to permit to lesser
   privileged code).

/run/systemd/units/ is a new directory, which contains a number of files
and symlinks encoding the above information. PID 1 creates and manages
these files, and journald reads them from there.

Note that this is supposed to be a direct path between PID 1 and the
journal only, due to the special runtime environment the journal runs
in. Normally, today we shouldn't introduce new interfaces that (mis-)use
a file system as IPC framework, and instead just use an IPC system, but this
is very hard to do between the journal and PID 1, as long as the IPC
system is a subject PID 1 manages, and itself a client to the journal.

This patch cleans up a couple of types used in journal code:
specifically we switch to size_t for a couple of memory-sizing values,
as size_t is the right choice for everything that is memory.

Fixes: #4089
Fixes: #3041
Fixes: #4441

											
										
										
											2017-11-02 19:43:32 +01:00
+								        ExecDirectoryType dt;
-												core: dump RuntimeDirectories and RuntimeDirectoryMode too

											
										
										
											2015-10-15 21:15:11 +02:00
+								        char **e, **d;
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        unsigned i;
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								        int r;
-												support chrooting/setting of ioprio when spawning

											
										
										
											2010-01-29 20:46:22 +01:00
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								        assert(c);
 								        assert(f);
-												core: general cgroup rework

Replace the very generic cgroup hookup with a much simpler one. With
this change only the high-level cgroup settings remain, the ability to
set arbitrary cgroup attributes is removed, so is support for adding
units to arbitrary cgroup controllers or setting arbitrary paths for
them (especially paths that are different for the various controllers).

This also introduces a new -.slice root slice, that is the parent of
system.slice and friends. This enables easy admin configuration of
root-level cgroup properties.

This replaces DeviceDeny= by DevicePolicy=, and implicitly adds in
/dev/null, /dev/zero and friends if DeviceAllow= is used (unless this is
turned off by DevicePolicy=).

											
										
										
											2013-06-27 04:14:27 +02:00
+								        prefix = strempty(prefix);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
 								        fprintf(f,
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                "%sUMask: %04o\n"
 								                "%sWorkingDirectory: %s\n"
-												execute: allow configuration of O_NONBLOCK flag from .service files

											
										
										
											2010-02-12 02:00:18 +01:00
+								                "%sRootDirectory: %s\n"
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								                "%sNonBlocking: %s\n"
-												exec: add ControlGroupModify= switch to allow changing access mode to cgroups fs

											
										
										
											2011-06-30 00:11:25 +02:00
+								                "%sPrivateTmp: %s\n"
-												exec: introduce PrivateDevices= switch to provide services with a private /dev

Similar to PrivateNetwork=, PrivateTmp= introduce PrivateDevices= that
sets up a private /dev with only the API pseudo-devices like /dev/null,
/dev/zero, /dev/random, but not any physical devices in them.

											
										
										
											2014-01-20 19:54:51 +01:00
+								                "%sPrivateDevices: %s\n"
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                "%sProtectKernelTunables: %s\n"
-												core: make sure to dump ProtectKernelModules= value

											
										
										
											2016-10-09 12:31:51 +02:00
+								                "%sProtectKernelModules: %s\n"
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                "%sProtectControlGroups: %s\n"
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								                "%sPrivateNetwork: %s\n"
 								                "%sPrivateUsers: %s\n"
-												core: rename ReadOnlySystem= to ProtectSystem= and add a third value for also mounting /etc read-only

Also, rename ProtectedHome= to ProtectHome=, to simplify things a bit.

With this in place we now have two neat options ProtectSystem= and
ProtectHome= for protecting the OS itself (and optionally its
configuration), and for protecting the user's data.

											
										
										
											2014-06-04 18:07:55 +02:00
+								                "%sProtectHome: %s\n"
 								                "%sProtectSystem: %s\n"
-												core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjunction with RootDirectory=

This adds a boolean unit file setting MountAPIVFS=. If set, the three
main API VFS mounts will be mounted for the service. This only has an
effect on RootDirectory=, which it makes a ton times more useful.

(This is basically the /dev + /proc + /sys mounting code posted in the
original #4727, but rebased on current git, and with the automatic logic
replaced by explicit logic controlled by a unit file setting)

											
										
										
											2016-12-22 23:34:35 +01:00
+								                "%sMountAPIVFS: %s\n"
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								                "%sIgnoreSIGPIPE: %s\n"
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								                "%sMemoryDenyWriteExecute: %s\n"
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								                "%sRestrictRealtime: %s\n"
 								                "%sKeyringMode: %s\n",
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								                prefix, c->umask,
-												support chrooting/setting of ioprio when spawning

											
										
										
											2010-01-29 20:46:22 +01:00
+								                prefix, c->working_directory ? c->working_directory : "/",
-												execute: allow configuration of O_NONBLOCK flag from .service files

											
										
										
											2010-02-12 02:00:18 +01:00
+								                prefix, c->root_directory ? c->root_directory : "/",
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								                prefix, yes_no(c->non_blocking),
-												exec: add ControlGroupModify= switch to allow changing access mode to cgroups fs

											
										
										
											2011-06-30 00:11:25 +02:00
+								                prefix, yes_no(c->private_tmp),
-												exec: introduce PrivateDevices= switch to provide services with a private /dev

Similar to PrivateNetwork=, PrivateTmp= introduce PrivateDevices= that
sets up a private /dev with only the API pseudo-devices like /dev/null,
/dev/zero, /dev/random, but not any physical devices in them.

											
										
										
											2014-01-20 19:54:51 +01:00
+								                prefix, yes_no(c->private_devices),
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                prefix, yes_no(c->protect_kernel_tunables),
-												core: make sure to dump ProtectKernelModules= value

											
										
										
											2016-10-09 12:31:51 +02:00
+								                prefix, yes_no(c->protect_kernel_modules),
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                prefix, yes_no(c->protect_control_groups),
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								                prefix, yes_no(c->private_network),
 								                prefix, yes_no(c->private_users),
-												core: rename ReadOnlySystem= to ProtectSystem= and add a third value for also mounting /etc read-only

Also, rename ProtectedHome= to ProtectHome=, to simplify things a bit.

With this in place we now have two neat options ProtectSystem= and
ProtectHome= for protecting the OS itself (and optionally its
configuration), and for protecting the user's data.

											
										
										
											2014-06-04 18:07:55 +02:00
+								                prefix, protect_home_to_string(c->protect_home),
 								                prefix, protect_system_to_string(c->protect_system),
-												core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjunction with RootDirectory=

This adds a boolean unit file setting MountAPIVFS=. If set, the three
main API VFS mounts will be mounted for the service. This only has an
effect on RootDirectory=, which it makes a ton times more useful.

(This is basically the /dev + /proc + /sys mounting code posted in the
original #4727, but rebased on current git, and with the automatic logic
replaced by explicit logic controlled by a unit file setting)

											
										
										
											2016-12-22 23:34:35 +01:00
+								                prefix, yes_no(c->mount_apivfs),
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								                prefix, yes_no(c->ignore_sigpipe),
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								                prefix, yes_no(c->memory_deny_write_execute),
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								                prefix, yes_no(c->restrict_realtime),
 								                prefix, exec_keyring_mode_to_string(c->keyring_mode));
-												set nice/oom_adjust only when asked for

											
										
										
											2010-01-28 02:53:56 +01:00
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        if (c->root_image)
 								                fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								        STRV_FOREACH(e, c->environment)
 								                fprintf(f, "%sEnvironment: %s\n", prefix, *e);
 								        STRV_FOREACH(e, c->environment_files)
 								                fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--set-env=...` options to systemd-spawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								        STRV_FOREACH(e, c->pass_environment)
 								                fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
+								        STRV_FOREACH(e, c->unset_environment)
 								                fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
-												core: allow preserving contents of RuntimeDirectory= over process restart

This introduces RuntimeDirectoryPreserve= option which takes a boolean
argument or 'restart'.

Closes #6087.

											
										
										
											2017-07-17 09:22:25 +02:00
+								        fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
 								                STRV_FOREACH(d, c->directories[dt].paths)
 								                        fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
 								        }
-												core: dump RuntimeDirectories and RuntimeDirectoryMode too

											
										
										
											2015-10-15 21:15:11 +02:00
-												set nice/oom_adjust only when asked for

											
										
										
											2010-01-28 02:53:56 +01:00
+								        if (c->nice_set)
 								                fprintf(f,
 								                        "%sNice: %i\n",
 								                        prefix, c->nice);
-												exec: replace OOMAdjust= by OOMScoreAdjust= to follow new kernel interface

This replaces OOMAdjust= by OOMScoreAdjust= in the config files,
breaking compatibility with older unit files. However, this keeps compat
with older kernels which lack the new OOM rework.

											
										
										
											2010-08-31 01:33:39 +02:00
+								        if (c->oom_score_adjust_set)
-												set nice/oom_adjust only when asked for

											
										
										
											2010-01-28 02:53:56 +01:00
+								                fprintf(f,
-												exec: replace OOMAdjust= by OOMScoreAdjust= to follow new kernel interface

This replaces OOMAdjust= by OOMScoreAdjust= in the config files,
breaking compatibility with older unit files. However, this keeps compat
with older kernels which lack the new OOM rework.

											
										
										
											2010-08-31 01:33:39 +02:00
+								                        "%sOOMScoreAdjust: %i\n",
 								                        prefix, c->oom_score_adjust);
-												support chrooting/setting of ioprio when spawning

											
										
										
											2010-01-29 20:46:22 +01:00
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        for (i = 0; i < RLIM_NLIMITS; i++)
-												core: dump soft limits too

											
										
										
											2015-11-28 18:15:03 +01:00
+								                if (c->rlimit[i]) {
-												core/execute: fix dump format for Limit*=

Fixes #9846.

											
										
										
											2018-08-10 06:03:02 +02:00
+								                        fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
-												core: dump soft limits too

											
										
										
											2015-11-28 18:15:03 +01:00
+								                                prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
-												core/execute: fix dump format for Limit*=

Fixes #9846.

											
										
										
											2018-08-10 06:03:02 +02:00
+								                        fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
-												core: dump soft limits too

											
										
										
											2015-11-28 18:15:03 +01:00
+								                                prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
 								                }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
+								        if (c->ioprio_set) {
-												execute: modernizations

											
										
										
											2014-02-19 17:49:00 +01:00
+								                _cleanup_free_ char *class_str = NULL;
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
-												core: do not ignore returned values

											
										
										
											2017-08-06 16:34:55 +02:00
+								                r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
 								                if (r >= 0)
 								                        fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
 								                fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
+								        if (c->cpu_sched_set) {
-												execute: modernizations

											
										
										
											2014-02-19 17:49:00 +01:00
+								                _cleanup_free_ char *policy_str = NULL;
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
-												core: do not ignore returned values

											
										
										
											2017-08-06 16:34:55 +02:00
+								                r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
 								                if (r >= 0)
 								                        fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                fprintf(f,
-												execute: allow configuration of SCHED_RESET_ON_FORK

											
										
										
											2010-02-02 12:50:04 +01:00
+								                        "%sCPUSchedulingPriority: %i\n"
 								                        "%sCPUSchedulingResetOnFork: %s\n",
 								                        prefix, c->cpu_sched_priority,
 								                        prefix, yes_no(c->cpu_sched_reset_on_fork));
-												tabs to spaces

Skipped bootchart and various files that looked like they should be
kept in sync with external sources.

											
										
										
											2013-01-09 21:03:11 +01:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												dbus: complete exec coverage

											
										
										
											2010-07-04 16:44:58 +02:00
+								        if (c->cpuset) {
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                fprintf(f, "%sCPUAffinity:", prefix);
-												dbus: complete exec coverage

											
										
										
											2010-07-04 16:44:58 +02:00
+								                for (i = 0; i < c->cpuset_ncpus; i++)
 								                        if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
-												build-sys: minor fixes found with cppcheck

											
										
										
											2013-12-25 19:00:12 +01:00
+								                                fprintf(f, " %u", i);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                fputs("\n", f);
 								        }
-												time-util: add and use USEC/NSEC_INFINITY

											
										
										
											2014-07-29 12:23:31 +02:00
+								        if (c->timer_slack_nsec != NSEC_INFINITY)
-												Use format patterns for usec_t, pid_t, nsec_t, usec_t

It is nicer to predefine patterns using configure time check instead of
using casts everywhere.

Since we do not need to use any flags, include "%" in the format instead
of excluding it like PRI* macros.

											
										
										
											2013-12-30 23:22:26 +01:00
+								                fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
 								        fprintf(f,
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                "%sStandardInput: %s\n"
 								                "%sStandardOutput: %s\n"
 								                "%sStandardError: %s\n",
 								                prefix, exec_input_to_string(c->std_input),
 								                prefix, exec_output_to_string(c->std_output),
 								                prefix, exec_output_to_string(c->std_error));
-												core: add exec_context_dump() support for fd: and file: stdio settings

This was missing for using fdnames as stdio, let's add support for
fdnames as well as file paths in one go.

											
										
										
											2017-10-27 16:13:59 +02:00
+								        if (c->std_input == EXEC_INPUT_NAMED_FD)
 								                fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
 								        if (c->std_output == EXEC_OUTPUT_NAMED_FD)
 								                fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
 								        if (c->std_error == EXEC_OUTPUT_NAMED_FD)
 								                fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
 								        if (c->std_input == EXEC_INPUT_FILE)
 								                fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
 								        if (c->std_output == EXEC_OUTPUT_FILE)
 								                fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
-												Add support for opening files for appending

Addresses part of #8983

											
										
										
											2018-07-03 21:22:29 +02:00
+								        if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
 								                fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
-												core: add exec_context_dump() support for fd: and file: stdio settings

This was missing for using fdnames as stdio, let's add support for
fdnames as well as file paths in one go.

											
										
										
											2017-10-27 16:13:59 +02:00
+								        if (c->std_error == EXEC_OUTPUT_FILE)
 								                fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
-												Add support for opening files for appending

Addresses part of #8983

											
										
										
											2018-07-03 21:22:29 +02:00
+								        if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
 								                fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
-												core: add exec_context_dump() support for fd: and file: stdio settings

This was missing for using fdnames as stdio, let's add support for
fdnames as well as file paths in one go.

											
										
										
											2017-10-27 16:13:59 +02:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        if (c->tty_path)
 								                fprintf(f,
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								                        "%sTTYPath: %s\n"
 								                        "%sTTYReset: %s\n"
 								                        "%sTTYVHangup: %s\n"
 								                        "%sTTYVTDisallocate: %s\n",
 								                        prefix, c->tty_path,
 								                        prefix, yes_no(c->tty_reset),
 								                        prefix, yes_no(c->tty_vhangup),
 								                        prefix, yes_no(c->tty_vt_disallocate));
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												execute: make use of IN_SET() where we can

											
										
										
											2017-08-01 11:52:36 +02:00
+								        if (IN_SET(c->std_output,
 								                   EXEC_OUTPUT_SYSLOG,
 								                   EXEC_OUTPUT_KMSG,
 								                   EXEC_OUTPUT_JOURNAL,
 								                   EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 								                   EXEC_OUTPUT_KMSG_AND_CONSOLE,
 								                   EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
 								            IN_SET(c->std_error,
 								                   EXEC_OUTPUT_SYSLOG,
 								                   EXEC_OUTPUT_KMSG,
 								                   EXEC_OUTPUT_JOURNAL,
 								                   EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 								                   EXEC_OUTPUT_KMSG_AND_CONSOLE,
 								                   EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
-												Introduce cleanup functions for cap_free

Unfortunately a different cleanup function is necessary per type,
because cap_t** and char** are incompatible with void**.

											
										
										
											2014-01-01 04:35:54 +01:00
+								                _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
-												core: do not ignore returned values

											
										
										
											2017-08-06 16:34:55 +02:00
+								                r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
 								                if (r >= 0)
 								                        fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
-												core: do not ignore returned values

											
										
										
											2017-08-06 16:34:55 +02:00
+								                r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
 								                if (r >= 0)
 								                        fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald

And let's make use of it to implement two new unit settings with it:

1. LogLevelMax= is a new per-unit setting that may be used to configure
   log priority filtering: set it to LogLevelMax=notice and only
   messages of level "notice" and lower (i.e. more important) will be
   processed, all others are dropped.

2. LogExtraFields= is a new per-unit setting for configuring per-unit
   journal fields, that are implicitly included in every log record
   generated by the unit's processes. It takes field/value pairs in the
   form of FOO=BAR.

Also, related to this, one existing unit setting is ported to this new
facility:

3. The invocation ID is now pulled from /run/systemd/units/ instead of
   cgroupfs xattrs. This substantially relaxes requirements of systemd
   on the kernel version and the privileges it runs with (specifically,
   cgroupfs xattrs are not available in containers, since they are
   stored in kernel memory, and hence are unsafe to permit to lesser
   privileged code).

/run/systemd/units/ is a new directory, which contains a number of files
and symlinks encoding the above information. PID 1 creates and manages
these files, and journald reads them from there.

Note that this is supposed to be a direct path between PID 1 and the
journal only, due to the special runtime environment the journal runs
in. Normally, today we shouldn't introduce new interfaces that (mis-)use
a file system as IPC framework, and instead just use an IPC system, but this
is very hard to do between the journal and PID 1, as long as the IPC
system is a subject PID 1 manages, and itself a client to the journal.

This patch cleans up a couple of types used in journal code:
specifically we switch to size_t for a couple of memory-sizing values,
as size_t is the right choice for everything that is memory.

Fixes: #4089
Fixes: #3041
Fixes: #4441

											
										
										
											2017-11-02 19:43:32 +01:00
+								        if (c->log_level_max >= 0) {
 								                _cleanup_free_ char *t = NULL;
 								                (void) log_level_to_string_alloc(c->log_level_max, &t);
 								                fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
 								        }
 								        if (c->n_log_extra_fields > 0) {
 								                size_t j;
 								                for (j = 0; j < c->n_log_extra_fields; j++) {
 								                        fprintf(f, "%sLogExtraFields: ", prefix);
 								                        fwrite(c->log_extra_fields[j].iov_base,
 , c->log_extra_fields[j].iov_len,
 								                               f);
 								                        fputc('\n', f);
 								                }
 								        }
-												securebits-util: add secure_bits_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:40:25 +02:00
+								        if (c->secure_bits) {
 								                _cleanup_free_ char *str = NULL;
 								                r = secure_bits_to_string_alloc(c->secure_bits, &str);
 								                if (r >= 0)
 								                        fprintf(f, "%sSecure Bits: %s\n", prefix, str);
 								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												capabilities: keep bounding set in non-inverted format.

Change the capability bounding set parser and logic so that the bounding
set is kept as a positive set internally. This means that the set
reflects those capabilities that we want to keep instead of drop.

											
										
										
											2016-01-07 23:00:04 +01:00
+								        if (c->capability_bounding_set != CAP_ALL) {
-												cap-list: add capability_set_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:25:11 +02:00
+								                _cleanup_free_ char *str = NULL;
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												cap-list: add capability_set_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:25:11 +02:00
+								                r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
 								                if (r >= 0)
 								                        fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								        }
 								        if (c->capability_ambient_set != 0) {
-												cap-list: add capability_set_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:25:11 +02:00
+								                _cleanup_free_ char *str = NULL;
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
-												cap-list: add capability_set_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:25:11 +02:00
+								                r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
 								                if (r >= 0)
 								                        fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        }
 								        if (c->user)
-												execute: handle format strings in User= and other directives

											
										
										
											2010-06-18 23:25:19 +02:00
+								                fprintf(f, "%sUser: %s\n", prefix, c->user);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        if (c->group)
-												execute: handle format strings in User= and other directives

											
										
										
											2010-06-18 23:25:19 +02:00
+								                fprintf(f, "%sGroup: %s\n", prefix, c->group);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								        fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
-												core: use strv_isempty to check if supplementary_groups is empty

With the previous commit, we know that it will be NULL if empty, but
it's safe to always use strv_isempty() in case the code changes
in the future.

											
										
										
											2017-10-04 11:33:30 +02:00
+								        if (!strv_isempty(c->supplementary_groups)) {
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                fprintf(f, "%sSupplementaryGroups:", prefix);
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								                strv_fprintf(f, c->supplementary_groups);
 								                fputs("\n", f);
 								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        if (c->pam_name)
-												execute: handle format strings in User= and other directives

											
										
										
											2010-06-18 23:25:19 +02:00
+								                fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
-												tree-wide: use !strv_isempty() instead of strv_length() > 0

											
										
										
											2017-12-15 08:36:35 +01:00
+								        if (!strv_isempty(c->read_write_paths)) {
-												doc,core: Read{Write,Only}Paths= and InaccessiblePaths=

This patch renames Read{Write,Only}Directories= and InaccessibleDirectories=
to Read{Write,Only}Paths= and InaccessiblePaths=, previous names are kept
as aliases but they are not advertised in the documentation.

Renamed variables:
`read_write_dirs` --> `read_write_paths`
`read_only_dirs` --> `read_only_paths`
`inaccessible_dirs` --> `inaccessible_paths`

											
										
										
											2016-07-07 11:17:00 +02:00
+								                fprintf(f, "%sReadWritePaths:", prefix);
 								                strv_fprintf(f, c->read_write_paths);
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								                fputs("\n", f);
 								        }
-												tree-wide: use !strv_isempty() instead of strv_length() > 0

											
										
										
											2017-12-15 08:36:35 +01:00
+								        if (!strv_isempty(c->read_only_paths)) {
-												doc,core: Read{Write,Only}Paths= and InaccessiblePaths=

This patch renames Read{Write,Only}Directories= and InaccessibleDirectories=
to Read{Write,Only}Paths= and InaccessiblePaths=, previous names are kept
as aliases but they are not advertised in the documentation.

Renamed variables:
`read_write_dirs` --> `read_write_paths`
`read_only_dirs` --> `read_only_paths`
`inaccessible_dirs` --> `inaccessible_paths`

											
										
										
											2016-07-07 11:17:00 +02:00
+								                fprintf(f, "%sReadOnlyPaths:", prefix);
 								                strv_fprintf(f, c->read_only_paths);
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								                fputs("\n", f);
 								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												tree-wide: use !strv_isempty() instead of strv_length() > 0

											
										
										
											2017-12-15 08:36:35 +01:00
+								        if (!strv_isempty(c->inaccessible_paths)) {
-												doc,core: Read{Write,Only}Paths= and InaccessiblePaths=

This patch renames Read{Write,Only}Directories= and InaccessibleDirectories=
to Read{Write,Only}Paths= and InaccessiblePaths=, previous names are kept
as aliases but they are not advertised in the documentation.

Renamed variables:
`read_write_dirs` --> `read_write_paths`
`read_only_dirs` --> `read_only_paths`
`inaccessible_dirs` --> `inaccessible_paths`

											
										
										
											2016-07-07 11:17:00 +02:00
+								                fprintf(f, "%sInaccessiblePaths:", prefix);
 								                strv_fprintf(f, c->inaccessible_paths);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                fputs("\n", f);
 								        }
-												execute: add ability to configure the kill signal

											
										
										
											2010-07-10 04:49:37 +02:00
-												core: add ability to define arbitrary bind mounts for services

This adds two new settings BindPaths= and BindReadOnlyPaths=. They allow
defining arbitrary bind mounts specific to particular services. This is
particularly useful for services with RootDirectory= set as this permits making
specific bits of the host directory available to chrooted services.

The two new settings follow the concepts nspawn already possess in --bind= and
--bind-ro=, as well as the .nspawn settings Bind= and BindReadOnly= (and these
latter options should probably be renamed to BindPaths= and BindReadOnlyPaths=
too).

Fixes: #3439

											
										
										
											2016-11-23 22:21:40 +01:00
+								        if (c->n_bind_mounts > 0)
-												core/namespace: make '-' prefix in Bind{,ReadOnly}Paths= work

Each path in `Bind{,ReadOnly}Paths=` accepts a '-' prefix. However,
the prefix is completely ignored.
This makes it work as expected.

											
										
										
											2018-02-21 01:07:56 +01:00
+								                for (i = 0; i < c->n_bind_mounts; i++)
 								                        fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
-												core: add ability to define arbitrary bind mounts for services

This adds two new settings BindPaths= and BindReadOnlyPaths=. They allow
defining arbitrary bind mounts specific to particular services. This is
particularly useful for services with RootDirectory= set as this permits making
specific bits of the host directory available to chrooted services.

The two new settings follow the concepts nspawn already possess in --bind= and
--bind-ro=, as well as the .nspawn settings Bind= and BindReadOnly= (and these
latter options should probably be renamed to BindPaths= and BindReadOnlyPaths=
too).

Fixes: #3439

											
										
										
											2016-11-23 22:21:40 +01:00
+								                                c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
-												core/namespace: make '-' prefix in Bind{,ReadOnly}Paths= work

Each path in `Bind{,ReadOnly}Paths=` accepts a '-' prefix. However,
the prefix is completely ignored.
This makes it work as expected.

											
										
										
											2018-02-21 01:07:56 +01:00
+								                                c->bind_mounts[i].ignore_enoent ? "-": "",
-												core: add ability to define arbitrary bind mounts for services

This adds two new settings BindPaths= and BindReadOnlyPaths=. They allow
defining arbitrary bind mounts specific to particular services. This is
particularly useful for services with RootDirectory= set as this permits making
specific bits of the host directory available to chrooted services.

The two new settings follow the concepts nspawn already possess in --bind= and
--bind-ro=, as well as the .nspawn settings Bind= and BindReadOnly= (and these
latter options should probably be renamed to BindPaths= and BindReadOnlyPaths=
too).

Fixes: #3439

											
										
										
											2016-11-23 22:21:40 +01:00
+								                                c->bind_mounts[i].source,
 								                                c->bind_mounts[i].destination,
 								                                c->bind_mounts[i].recursive ? "rbind" : "norbind");
-												core: add new setting TemporaryFileSystem=

This introduces a new setting TemporaryFileSystem=. This is useful
to hide files not relevant to the processes invoked by the unit, while
necessary files or directories can still be accessed by combining
it with Bind{,ReadOnly}Paths=.

											
										
										
											2018-02-21 01:17:52 +01:00
+								        if (c->n_temporary_filesystems > 0)
 								                for (i = 0; i < c->n_temporary_filesystems; i++) {
 								                        TemporaryFileSystem *t = c->temporary_filesystems + i;
 								                        fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
 								                                t->path,
 								                                isempty(t->options) ? "" : ":",
 								                                strempty(t->options));
 								                }
-												service: optionally, create INIT_PROCESS/DEAD_PROCESS entries for a service

This should fix accounting for pam_limits and suchlike.

https://bugzilla.redhat.com/show_bug.cgi?id=636036

											
										
										
											2010-10-08 16:06:23 +02:00
+								        if (c->utmp_id)
 								                fprintf(f,
 								                        "%sUtmpIdentifier: %s\n",
 								                        prefix, c->utmp_id);
-												exec: Add SELinuxContext configuration item

This permits system administrators to decide the domain of a service.
This can be used with templated units to have each service in a different
domain (for example, a per-customer database, using MLS or anything),
or can be used to force a non-SELinux-enabled system (jvm, erlang, etc.)
to start in a different domain for each service.

											
										
										
											2014-02-06 10:05:16 +01:00
 								        if (c->selinux_context)
 								                fprintf(f,
-												core: store and expose SELinuxContext field normalized as bool + string

											
										
										
											2014-02-17 16:52:52 +01:00
+								                        "%sSELinuxContext: %s%s\n",
 								                        prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
-												core: dump also missed security context

											
										
										
											2017-07-13 06:10:41 +02:00
+								        if (c->apparmor_profile)
 								                fprintf(f,
 								                        "%sAppArmorProfile: %s%s\n",
 								                        prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
 								        if (c->smack_process_label)
 								                fprintf(f,
 								                        "%sSmackProcessLabel: %s%s\n",
 								                        prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
-												util: introduce PERSONALITY_INVALID as macro for 0xffffffffLU

											
										
										
											2015-05-21 19:48:49 +02:00
+								        if (c->personality != PERSONALITY_INVALID)
-												core: add Personality= option for units to set the personality for spawned processes

											
										
										
											2014-02-19 02:15:24 +01:00
+								                fprintf(f,
 								                        "%sPersonality: %s\n",
 								                        prefix, strna(personality_to_string(c->personality)));
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down the personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and a source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
+								        fprintf(f,
 								                "%sLockPersonality: %s\n",
 								                prefix, yes_no(c->lock_personality));
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								        if (c->syscall_filter) {
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                Iterator j;
-												core: add support to specify errno in SystemCallFilter=

This makes each system call in the SystemCallFilter= blacklist optionally
take an errno name or number after a colon. The errno takes precedence
over the one given by SystemCallErrorNumber=.

C.f. #7173.
Closes #7169.

											
										
										
											2017-11-11 13:35:49 +01:00
+								                void *id, *val;
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                bool first = true;
-												core: fix build without libseccomp

											
										
										
											2014-02-12 18:44:40 +01:00
+								#endif
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
 								                fprintf(f,
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								                        "%sSystemCallFilter: ",
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                        prefix);
 								                if (!c->syscall_whitelist)
 								                        fputc('~', f);
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: add support to specify errno in SystemCallFilter=

This makes each system call in the SystemCallFilter= blacklist optionally
take an errno name or number after a colon. The errno takes precedence
over the one given by SystemCallErrorNumber=.

C.f. #7173.
Closes #7169.

											
										
										
											2017-11-11 13:35:49 +01:00
+								                HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                        _cleanup_free_ char *name = NULL;
-												core: add support to specify errno in SystemCallFilter=

This makes each system call in the SystemCallFilter= blacklist optionally
take an errno name or number after a colon. The errno takes precedence
over the one given by SystemCallErrorNumber=.

C.f. #7173.
Closes #7169.

											
										
										
											2017-11-11 13:35:49 +01:00
+								                        const char *errno_name = NULL;
 								                        int num = PTR_TO_INT(val);
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
 								                        if (first)
 								                                first = false;
 								                        else
 								                                fputc(' ', f);
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								                        name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                        fputs(strna(name), f);
-												core: add support to specify errno in SystemCallFilter=

This makes each system call in the SystemCallFilter= blacklist optionally
take an errno name or number after a colon. The errno takes precedence
over the one given by SystemCallErrorNumber=.

C.f. #7173.
Closes #7169.

											
										
										
											2017-11-11 13:35:49 +01:00
 								                        if (num >= 0) {
 								                                errno_name = errno_to_name(num);
 								                                if (errno_name)
 								                                        fprintf(f, ":%s", errno_name);
 								                                else
 								                                        fprintf(f, ":%d", num);
 								                        }
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                }
-												core: fix build without libseccomp

											
										
										
											2014-02-12 18:44:40 +01:00
+								#endif
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
 								                fputc('\n', f);
 								        }
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								        if (c->syscall_archs) {
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								                Iterator j;
 								                void *id;
 								#endif
 								                fprintf(f,
 								                        "%sSystemCallArchitectures:",
 								                        prefix);
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								                SET_FOREACH(id, c->syscall_archs, j)
 								                        fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
 								#endif
 								                fputc('\n', f);
 								        }
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								        if (exec_context_restrict_namespaces_set(c)) {
 								                _cleanup_free_ char *s = NULL;
-												nsflsgs: drop namespace_flag_{from,to}_string()

This also drops namespace_flag_to_string_many_with_check(), and
renames namespace_flag_{from,to}_string_many() to
namespace_flags_{from,to}_string().

											
										
										
											2018-05-01 03:48:21 +02:00
+								                r = namespace_flags_to_string(c->restrict_namespaces, &s);
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								                if (r >= 0)
 								                        fprintf(f, "%sRestrictNamespaces: %s\n",
 								                                prefix, s);
 								        }
-												core: allow to specify errno number in SystemCallErrorNumber=

											
										
										
											2017-11-11 13:40:20 +01:00
+								        if (c->syscall_errno > 0) {
 								                const char *errno_name;
 								                fprintf(f, "%sSystemCallErrorNumber: ", prefix);
 								                errno_name = errno_to_name(c->syscall_errno);
 								                if (errno_name)
 								                        fprintf(f, "%s\n", errno_name);
 								                else
 								                        fprintf(f, "%d\n", c->syscall_errno);
 								        }
-												core: Add AppArmor profile switching

This permits switching to a specific AppArmor profile when starting a daemon. This
will result in a no-op if AppArmor is disabled.
It also adds a new build requirement on libapparmor for using this feature.

											
										
										
											2014-02-20 16:19:44 +01:00
 								        if (c->apparmor_profile)
 								                fprintf(f,
 								                        "%sAppArmorProfile: %s%s\n",
 								                        prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								bool exec_context_maintains_privileges(const ExecContext *c) {
-												core: introduce new Delegate=yes/no property controlling creation of cgroup subhierarchies

For privileged units this resource control property ensures that the
processes have all controllers systemd manages enabled.

For unprivileged services (those with User= set) this ensures that
access rights to the service cgroup are granted to the user in question,
to create further subgroups. Note that this only applies to the
name=systemd hierarchy though, as access to other controllers is not
safe for unprivileged processes.

Delegate=yes should be set for container scopes where a systemd instance
inside the container shall manage the hierarchies below its own cgroup
and have access to all controllers.

Delegate=yes should also be set for user@.service, so that systemd
--user can run, controlling its own cgroup tree.

This commit changes machined, systemd-nspawn@.service and user@.service
to set this boolean, in order to ensure that container management will
just work, and the user systemd instance can run fine.

											
										
										
											2014-11-05 17:57:23 +01:00
+								        assert(c);
-												treewide: fix typos and remove accidental repetition of words

											
										
										
											2016-07-10 14:48:23 +02:00
+								        /* Returns true if the process forked off would run under
-												core: introduce new Delegate=yes/no property controlling creation of cgroup subhierarchies

For privileged units this resource control property ensures that the
processes have all controllers systemd manages enabled.

For unprivileged services (those with User= set) this ensures that
access rights to the service cgroup are granted to the user in question,
to create further subgroups. Note that this only applies to the
name=systemd hierarchy though, as access to other controllers is not
safe for unprivileged processes.

Delegate=yes should be set for container scopes where a systemd instance
inside the container shall manage the hierarchies below its own cgroup
and have access to all controllers.

Delegate=yes should also be set for user@.service, so that systemd
--user can run, controlling its own cgroup tree.

This commit changes machined, systemd-nspawn@.service and user@.service
to set this boolean, in order to ensure that container management will
just work, and the user systemd instance can run fine.

											
										
										
											2014-11-05 17:57:23 +01:00
+								         * an unchanged UID or as root. */
 								        if (!c->user)
 								                return true;
 								        if (streq(c->user, "root") || streq(c->user, "0"))
 								                return true;
 								        return false;
 								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								int exec_context_get_effective_ioprio(const ExecContext *c) {
-												core: make IOSchedulingClass= and IOSchedulingPriority= settable for transient units

This patch is a bit more complex than I hoped. In particular the single
IOScheduling= property exposed on the bus is split up into
IOSchedulingClass= and IOSchedulingPriority= (though compat is
retained). Otherwise the asymmetry between setting props and getting
them is a bit too nasty.

Fixes #5613

											
										
										
											2017-06-26 17:40:08 +02:00
+								        int p;
 								        assert(c);
 								        if (c->ioprio_set)
 								                return c->ioprio;
 								        p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
 								        if (p < 0)
 								                return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
 								        return p;
 								}
-												core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald

And let's make use of it to implement two new unit settings with it:

1. LogLevelMax= is a new per-unit setting that may be used to configure
   log priority filtering: set it to LogLevelMax=notice and only
   messages of level "notice" and lower (i.e. more important) will be
   processed, all others are dropped.

2. LogExtraFields= is a new per-unit setting for configuring per-unit
   journal fields, that are implicitly included in every log record
   generated by the unit's processes. It takes field/value pairs in the
   form of FOO=BAR.

Also, related to this, one existing unit setting is ported to this new
facility:

3. The invocation ID is now pulled from /run/systemd/units/ instead of
   cgroupfs xattrs. This substantially relaxes requirements of systemd
   on the kernel version and the privileges it runs with (specifically,
   cgroupfs xattrs are not available in containers, since they are
   stored in kernel memory, and hence are unsafe to permit to lesser
   privileged code).

/run/systemd/units/ is a new directory, which contains a number of files
and symlinks encoding the above information. PID 1 creates and manages
these files, and journald reads them from there.

Note that this is supposed to be a direct path between PID 1 and the
journal only, due to the special runtime environment the journal runs
in. Normally, today we shouldn't introduce new interfaces that (mis-)use
a file system as IPC framework, and instead just use an IPC system, but this
is very hard to do between the journal and PID 1, as long as the IPC
system is a subject PID 1 manages, and itself a client to the journal.

This patch cleans up a couple of types used in journal code:
specifically we switch to size_t for a couple of memory-sizing values,
as size_t is the right choice for everything that is memory.

Fixes: #4089
Fixes: #3041
Fixes: #4441

											
										
										
											2017-11-02 19:43:32 +01:00
+								void exec_context_free_log_extra_fields(ExecContext *c) {
 								        /* Releases the extra log fields stored in the context: every entry's
 								         * iov_base buffer is freed, then the array itself, and the element
 								         * count is reset to zero. */
 								        size_t l;
 								        assert(c);
 								        for (l = 0; l < c->n_log_extra_fields; l++)
 								                free(c->log_extra_fields[l].iov_base);
 								        /* Store mfree()'s return value back so the field no longer refers to
 								         * the released array (presumably NULL; confirm against mfree()). */
 								        c->log_extra_fields = mfree(c->log_extra_fields);
 								        c->n_log_extra_fields = 0;
 								}
-												dbus: complete exec status coverage

											
										
										
											2010-07-04 18:49:58 +02:00
+								void exec_status_start(ExecStatus *s, pid_t pid) {
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        assert(s);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
-												execute: use structure initialization when filling in exec status

											
										
										
											2018-07-17 16:00:21 +02:00
+								        *s = (ExecStatus) {
 								                .pid = pid,
 								        };
-												dbus: complete exec status coverage

											
										
										
											2010-07-04 18:49:58 +02:00
+								        dual_timestamp_get(&s->start_timestamp);
 								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
-												dbus: complete exec status coverage

											
										
										
											2010-07-04 18:49:58 +02:00
+								        assert(s);
-												execute: use structure initialization when filling in exec status

											
										
										
											2018-07-17 16:00:21 +02:00
+								        if (s->pid != pid) {
 								                *s = (ExecStatus) {
 								                        .pid = pid,
 								                };
 								        }
-												dbus: complete exec status coverage

											
										
										
											2010-07-04 18:49:58 +02:00
-												core: rename struct timestamp to dual_timestamp to avoid name clash with IP system headers

											
										
										
											2010-07-01 00:26:44 +02:00
+								        dual_timestamp_get(&s->exit_timestamp);
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        s->code = code;
 								        s->status = status;
-												service: optionally, create INIT_PROCESS/DEAD_PROCESS entries for a service

This should fix accounting for pam_limits and suchlike.

https://bugzilla.redhat.com/show_bug.cgi?id=636036

											
										
										
											2010-10-08 16:06:23 +02:00
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								        if (context) {
 								                if (context->utmp_id)
-												execute: use structure initialization when filling in exec status

											
										
										
											2018-07-17 16:00:21 +02:00
+								                        (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                exec_context_tty_reset(context, NULL);
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								        }
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								}
-												core: properly reset all ExecStatus structures when entering a new unit cycle

Whenever a unit is started fresh we should flush out any runtime data
from the previous cycle. We are pretty good at that already, but what we
missed so far was the ExecStart=/ExecStop=/… command exit status data.
Let's fix that, and properly flush out that stuff too.

Consider this service:

    [Service]
    ExecStart=/bin/sleep infinity
    ExecStop=/bin/false

When this service is started, then stopped and then started again
"systemctl status" would show the ExecStop= results of the previous run
along with the ExecStart= results of the current one, which is very
confusing. With this patch this is corrected: the data is kept right
until the moment the new service cycle starts, and then flushed out.
Hence "systemctl status" in that case will only show the ExecStart=
data, but no ExecStop= data, like it should be.

This should fix part of the confusion of #9588

											
										
										
											2018-07-17 19:36:46 +02:00
+								void exec_status_reset(ExecStatus *s) {
 								        /* Clears the status record by overwriting it with an empty
 								         * ExecStatus compound literal, i.e. all fields zero-initialized. */
 								        assert(s);
 								        *s = (ExecStatus) {};
 								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								        char buf[FORMAT_TIMESTAMP_MAX];
 								        assert(s);
 								        assert(f);
 								        if (s->pid <= 0)
 								                return;
-												core: unify how we generate the prefix string when dumping unit state

											
										
										
											2014-08-21 16:15:49 +02:00
+								        prefix = strempty(prefix);
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								        fprintf(f,
-												Use format patterns for usec_t, pid_t, nsec_t, usec_t

It is nicer to predefine patterns using configure time check instead of
using casts everywhere.

Since we do not need to use any flags, include "%" in the format instead
of excluding it like PRI* macros.

											
										
										
											2013-12-30 23:22:26 +01:00
+								                "%sPID: "PID_FMT"\n",
 								                prefix, s->pid);
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
-												core: use the correct APIs to determine whether a dual timestamp is initialized

											
										
										
											2016-07-27 11:50:37 +02:00
+								        if (dual_timestamp_is_set(&s->start_timestamp))
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								                fprintf(f,
 								                        "%sStart Timestamp: %s\n",
-												core: rename struct timestamp to dual_timestamp to avoid name clash with IP system headers

											
										
										
											2010-07-01 00:26:44 +02:00
+								                        prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
-												core: use the correct APIs to determine whether a dual timestamp is initialized

											
										
										
											2016-07-27 11:50:37 +02:00
+								        if (dual_timestamp_is_set(&s->exit_timestamp))
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								                fprintf(f,
 								                        "%sExit Timestamp: %s\n"
 								                        "%sExit Code: %s\n"
 								                        "%sExit Status: %i\n",
-												core: rename struct timestamp to dual_timestamp to avoid name clash with IP system headers

											
										
										
											2010-07-01 00:26:44 +02:00
+								                        prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								                        prefix, sigchld_code_to_string(s->code),
 								                        prefix, s->status);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								}
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								static char *exec_command_line(char **argv) {
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								        size_t k;
 								        char *n, *p, **a;
 								        bool first = true;
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
+								        assert(argv);
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
-												properly terminate strings with NUL byte

											
										
										
											2010-01-27 02:15:54 +01:00
+								        k = 1;
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
+								        STRV_FOREACH(a, argv)
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								                k += strlen(*a)+3;
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, while changing security contexts
is most likely associated with some form of policy. Moreover, since seccomp may
influence our own flow of code quite a bit (much more than the security context
change), make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								        n = new(char, k);
 								        if (!n)
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								                return NULL;
 								        p = n;
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
+								        STRV_FOREACH(a, argv) {
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
 								                if (!first)
 								                        *(p++) = ' ';
 								                else
 								                        first = false;
 								                if (strpbrk(*a, WHITESPACE)) {
 								                        *(p++) = '\'';
 								                        p = stpcpy(p, *a);
 								                        *(p++) = '\'';
 								                } else
 								                        p = stpcpy(p, *a);
 								        }
-												properly terminate strings with NUL byte

											
										
										
											2010-01-27 02:15:54 +01:00
+								        *p = 0;
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								        /* FIXME: this doesn't really handle arguments that have
 								         * spaces and ticks in them */
 								        return n;
 								}
-												core/execute: make arguments constant if possible

Also make functions static if possible.

											
										
										
											2018-02-06 04:17:50 +01:00
+								static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
-												use more _cleanup_ macro

											
										
										
											2014-06-24 19:00:32 +02:00
+								        _cleanup_free_ char *cmd = NULL;
-												core: unify how we generate the prefix string when dumping unit state

											
										
										
											2014-08-21 16:15:49 +02:00
+								        const char *prefix2;
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
 								        assert(c);
 								        assert(f);
-												core: unify how we generate the prefix string when dumping unit state

											
										
										
											2014-08-21 16:15:49 +02:00
+								        prefix = strempty(prefix);
-												util: rework strappenda(), and rename it strjoina()

After all it is now much more like strjoin() than strappend(). At the
same time, add support for NULL sentinels, even if they are normally not
necessary.

											
										
										
											2015-02-03 02:05:59 +01:00
+								        prefix2 = strjoina(prefix, "\t");
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
+								        cmd = exec_command_line(c->argv);
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								        fprintf(f,
 								                "%sCommand Line: %s\n",
 								                prefix, cmd ? cmd : strerror(ENOMEM));
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								        exec_status_dump(&c->exec_status, f, prefix2);
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								}
 								void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
 								        assert(f);
-												core: unify how we generate the prefix string when dumping unit state

											
										
										
											2014-08-21 16:15:49 +02:00
+								        prefix = strempty(prefix);
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
 								        LIST_FOREACH(command, c, c)
 								                exec_command_dump(c, f, prefix);
 								}
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												execute: simplify appending to execution list

											
										
										
											2010-02-14 01:05:55 +01:00
+								void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
 								        ExecCommand *end;
 								        assert(l);
 								        assert(e);
 								        if (*l) {
-												Spelling Corrections

Just some lame spelling corrections with no functionality.

											
										
										
											2011-02-21 15:32:17 +01:00
+								                /* It's kind of important, that we keep the order here */
-												list: make our list macros a bit easier to use by not requring type spec on each invocation

We can determine the list entry type via the typeof() gcc construct, and
so we should use it to make the macros much shorter to use.

											
										
										
											2013-10-14 06:10:14 +02:00
+								                LIST_FIND_TAIL(command, *l, end);
 								                LIST_INSERT_AFTER(command, *l, end, e);
-												execute: simplify appending to execution list

											
										
										
											2010-02-14 01:05:55 +01:00
+								        } else
 								              *l = e;
 								}
-												execute: introduce exec_command_set() for easy setting for command lines

											
										
										
											2010-04-10 17:46:41 +02:00
+								int exec_command_set(ExecCommand *c, const char *path, ...) {
 								        va_list ap;
 								        char **l, *p;
 								        assert(c);
 								        assert(path);
 								        va_start(ap, path);
 								        l = strv_new_ap(path, ap);
 								        va_end(ap);
 								        if (!l)
 								                return -ENOMEM;
-												strv: introduce new strv_from_stdarg_alloca() macro to generate a string array from stdarg function parameters

This allows us to turn lists of strings passed in easily into string
arrays without having to allocate memory.

											
										
										
											2013-10-29 19:53:43 +01:00
+								        p = strdup(path);
 								        if (!p) {
-												execute: introduce exec_command_set() for easy setting for command lines

											
										
										
											2010-04-10 17:46:41 +02:00
+								                strv_free(l);
 								                return -ENOMEM;
 								        }
 								        free(c->path);
 								        c->path = p;
-												tree-wide: use strv_free_and_replace() macro

											
										
										
											2018-05-09 17:34:46 +02:00
+								        return strv_free_and_replace(c->argv, l);
-												execute: introduce exec_command_set() for easy setting for command lines

											
										
										
											2010-04-10 17:46:41 +02:00
+								}
-												swap: introduce Discard property

Process possible "discard" values from /etc/fstab.

											
										
										
											2014-09-24 14:29:05 +02:00
+								int exec_command_append(ExecCommand *c, const char *path, ...) {
-												core: execute - don't leak strv

											
										
										
											2014-09-30 11:34:01 +02:00
+								        _cleanup_strv_free_ char **l = NULL;
-												swap: introduce Discard property

Process possible "discard" values from /etc/fstab.

											
										
										
											2014-09-24 14:29:05 +02:00
+								        va_list ap;
 								        int r;
 								        assert(c);
 								        assert(path);
 								        va_start(ap, path);
 								        l = strv_new_ap(path, ap);
 								        va_end(ap);
 								        if (!l)
 								                return -ENOMEM;
-												ask-password: add support for caching passwords in the kernel keyring

This adds support for caching harddisk passwords in the kernel keyring
if it is available, thus supporting caching without Plymouth being
around.

This is also useful for hooking up "gdm-auto-login" with the collected
boot-time harddisk password, in order to support gnome keyring
passphrase unlocking via the HDD password, if it is the same.

Any passwords added to the kernel keyring this way have a timeout of
2.5min at which time they are purged from the kernel.

											
										
										
											2015-10-07 11:26:10 +02:00
+								        r = strv_extend_strv(&c->argv, l, false);
-												core: execute - don't leak strv

											
										
										
											2014-09-30 11:34:01 +02:00
+								        if (r < 0)
-												swap: introduce Discard property

Process possible "discard" values from /etc/fstab.

											
										
										
											2014-09-24 14:29:05 +02:00
+								                return r;
 								        return 0;
 								}
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								static void *remove_tmpdir_thread(void *p) {
 								        _cleanup_free_ char *path = p;
-												swap: introduce Discard property

Process possible "discard" values from /etc/fstab.

											
										
										
											2014-09-24 14:29:05 +02:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
 								        return NULL;
 								}
 								static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
 								        int r;
 								        if (!rt)
 								                return NULL;
 								        if (rt->manager)
 								                (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
 								        /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
 								        if (destroy && rt->tmp_dir) {
 								                log_debug("Spawning thread to nuke %s", rt->tmp_dir);
 								                r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
 								                if (r < 0) {
 								                        log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
 								                        free(rt->tmp_dir);
 								                }
 								                rt->tmp_dir = NULL;
 								        }
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        if (destroy && rt->var_tmp_dir) {
 								                log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
 								                r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
 								                if (r < 0) {
 								                        log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
 								                        free(rt->var_tmp_dir);
 								                }
 								                rt->var_tmp_dir = NULL;
 								        }
 								        rt->id = mfree(rt->id);
 								        rt->tmp_dir = mfree(rt->tmp_dir);
 								        rt->var_tmp_dir = mfree(rt->var_tmp_dir);
 								        safe_close_pair(rt->netns_storage_socket);
 								        return mfree(rt);
 								}
 								/* Cleanup helper for _cleanup_(exec_runtime_freep): if the pointed-to slot holds an object,
 								 * release it via exec_runtime_free() with destroy=false. The slot itself is not reset. */
 								static void exec_runtime_freep(ExecRuntime **rt) {
 								        ExecRuntime *held = *rt;

 								        if (!held)
 								                return;

 								        (void) exec_runtime_free(held, false);
 								}
 								/* Allocates a zero-initialized ExecRuntime and stores it in *rt, with both netns storage
 								 * socket slots marked as unset (-1).
 								 *
 								 * Returns 0 on success, or -ENOMEM if the allocation failed (in which case *rt is NULL). */
 								static int exec_runtime_allocate(ExecRuntime **rt) {
 								        ExecRuntime *fresh;

 								        assert(rt);

 								        fresh = new0(ExecRuntime, 1);
 								        *rt = fresh;
 								        if (!fresh)
 								                return -ENOMEM;

 								        /* -1 marks "no fd", so that closing the pair later is harmless. */
 								        fresh->netns_storage_socket[0] = -1;
 								        fresh->netns_storage_socket[1] = -1;

 								        return 0;
 								}
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								static int exec_runtime_add(
 								                Manager *m,
 								                const char *id,
 								                const char *tmp_dir,
 								                const char *var_tmp_dir,
 								                const int netns_storage_socket[2],
 								                ExecRuntime **ret) {
 								        _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        int r;
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        assert(m);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        assert(id);
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
 								        if (r < 0)
 								                return r;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        r = exec_runtime_allocate(&rt);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        if (r < 0)
 								                return r;
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        rt->id = strdup(id);
 								        if (!rt->id)
 								                return -ENOMEM;
 								        if (tmp_dir) {
 								                rt->tmp_dir = strdup(tmp_dir);
 								                if (!rt->tmp_dir)
 								                        return -ENOMEM;
 								                /* When tmp_dir is set, then we require var_tmp_dir is also set. */
 								                assert(var_tmp_dir);
 								                rt->var_tmp_dir = strdup(var_tmp_dir);
 								                if (!rt->var_tmp_dir)
 								                        return -ENOMEM;
 								        }
 								        if (netns_storage_socket) {
 								                rt->netns_storage_socket[0] = netns_storage_socket[0];
 								                rt->netns_storage_socket[1] = netns_storage_socket[1];
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        }
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
 								        if (r < 0)
 								                return r;
 								        rt->manager = m;
 								        if (ret)
 								                *ret = rt;
 								        /* do not remove created ExecRuntime object when the operation succeeds. */
 								        rt = NULL;
 								        return 0;
 								}
 								/* Builds the runtime resources requested by 'c' and registers a new ExecRuntime for 'id'
 								 * through exec_runtime_add().
 								 *
 								 * Returns 0 if 'c' enables neither private_network nor private_tmp (nothing is created and
 								 * *ret is not written), 1 if an object was created, or a negative errno-style value on
 								 * failure. */
 								static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
 								        _cleanup_free_ char *private_tmp = NULL, *private_var_tmp = NULL;
 								        _cleanup_close_pair_ int sock_pair[2] = {-1, -1};
 								        int r;

 								        assert(m);
 								        assert(c);
 								        assert(id);

 								        /* It is not necessary to create ExecRuntime object. */
 								        if (!(c->private_network || c->private_tmp))
 								                return 0;

 								        if (c->private_tmp) {
 								                /* NOTE(review): presumably this creates the directories on disk; if a later step
 								                 * fails only the path strings are freed here — confirm whether they need removal. */
 								                r = setup_tmp_dirs(id, &private_tmp, &private_var_tmp);
 								                if (r < 0)
 								                        return r;
 								        }

 								        if (c->private_network &&
 								            socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, sock_pair) < 0)
 								                return -errno;

 								        r = exec_runtime_add(m, id, private_tmp, private_var_tmp, sock_pair, ret);
 								        if (r < 0)
 								                return r;

 								        /* Avoid cleanup: the fds now belong to the registered object. */
 								        sock_pair[0] = sock_pair[1] = -1;

 								        return 1;
 								}
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
 								        ExecRuntime *rt;
 								        int r;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        assert(m);
 								        assert(id);
 								        assert(ret);
 								        rt = hashmap_get(m->exec_runtime_by_id, id);
 								        if (rt)
 								                /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
 								                goto ref;
 								        if (!create)
 								                return 0;
 								        /* If not found, then create a new object. */
 								        r = exec_runtime_make(m, c, id, &rt);
 								        if (r <= 0)
 								                /* When r == 0, it is not necessary to create ExecRuntime object. */
 								                return r;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								ref:
 								        /* increment reference counter. */
 								        rt->n_ref++;
 								        *ret = rt;
 								        return 1;
 								}
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
 								        if (!rt)
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                return NULL;
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        assert(rt->n_ref > 0);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        rt->n_ref--;
 								        if (rt->n_ref > 0)
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlinks exist for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                return NULL;
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        return exec_runtime_free(rt, destroy);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								}
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
 								        ExecRuntime *rt;
 								        Iterator i;
 								        assert(m);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        assert(f);
 								        assert(fds);
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
 								                fprintf(f, "exec-runtime=%s", rt->id);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                if (rt->tmp_dir)
 								                        fprintf(f, " tmp-dir=%s", rt->tmp_dir);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                if (rt->var_tmp_dir)
 								                        fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                if (rt->netns_storage_socket[0] >= 0) {
 								                        int copy;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                        copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
 								                        if (copy < 0)
 								                                return copy;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                        fprintf(f, " netns-socket-0=%i", copy);
 								                }
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                if (rt->netns_storage_socket[1] >= 0) {
 								                        int copy;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                        copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
 								                        if (copy < 0)
 								                                return copy;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                        fprintf(f, " netns-socket-1=%i", copy);
 								                }
 								                fputc('\n', f);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        }
 								        return 0;
 								}
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
 								        _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
 								        ExecRuntime *rt;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        int r;
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        /* This is for the migration from old (v237 or earlier) serialized text.
 								         * Due to bug #7790, this may not work for units that use JoinsNamespaceOf=.
 								         * Even if the ExecRuntime object was originally created by another unit, we cannot
 								         * tell that from the serialized text, so we always create a new object owned by this unit. */
 								        assert(u);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        assert(key);
 								        assert(value);
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        /* Manager manages ExecRuntime objects by the unit id.
 								         * So, we omit the serialized text when the unit does not have id (yet?)... */
 								        if (isempty(u->id)) {
 								                log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
 								                return 0;
 								        }
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
 								        if (r < 0) {
 								                log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
 								                return 0;
 								        }
 								        rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
 								        if (!rt) {
 								                r = exec_runtime_allocate(&rt_create);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                if (r < 0)
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlinks exist for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        return log_oom();
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                rt_create->id = strdup(u->id);
 								                if (!rt_create->id)
 								                        return log_oom();
 								                rt = rt_create;
 								        }
 								        if (streq(key, "tmp-dir")) {
 								                char *copy;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                copy = strdup(value);
 								                if (!copy)
 								                        return log_oom();
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                free_and_replace(rt->tmp_dir, copy);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
 								        } else if (streq(key, "var-tmp-dir")) {
 								                char *copy;
 								                copy = strdup(value);
 								                if (!copy)
 								                        return log_oom();
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                free_and_replace(rt->var_tmp_dir, copy);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
 								        } else if (streq(key, "netns-socket-0")) {
 								                int fd;
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlinks exist for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                        return 0;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                }
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
 								                safe_close(rt->netns_storage_socket[0]);
 								                rt->netns_storage_socket[0] = fdset_remove(fds, fd);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        } else if (streq(key, "netns-socket-1")) {
 								                int fd;
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlinks exist for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                        return 0;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                }
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
 								                safe_close(rt->netns_storage_socket[1]);
 								                rt->netns_storage_socket[1] = fdset_remove(fds, fd);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        } else
 								                return 0;
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
 								        if (rt_create) {
 								                r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
 								                if (r < 0) {
-												Correct a number of trivial typos.

											
										
										
											2018-06-18 22:43:12 +02:00
+								                        log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                        return 0;
 								                }
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                rt_create->manager = u->manager;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                /* Avoid cleanup */
 								                rt_create = NULL;
 								        }
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        return 1;
 								}
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
 								        char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
 								        int r, fd0 = -1, fd1 = -1;
 								        const char *p, *v = value;
 								        size_t n;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        assert(m);
 								        assert(value);
 								        assert(fds);
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        n = strcspn(v, " ");
 								        id = strndupa(v, n);
 								        if (v[n] != ' ')
 								                goto finalize;
 								        p = v + n + 1;
 								        v = startswith(p, "tmp-dir=");
 								        if (v) {
 								                n = strcspn(v, " ");
 								                tmp_dir = strndupa(v, n);
 								                if (v[n] != ' ')
 								                        goto finalize;
 								                p = v + n + 1;
 								        }
 								        v = startswith(p, "var-tmp-dir=");
 								        if (v) {
 								                n = strcspn(v, " ");
 								                var_tmp_dir = strndupa(v, n);
 								                if (v[n] != ' ')
 								                        goto finalize;
 								                p = v + n + 1;
 								        }
 								        v = startswith(p, "netns-socket-0=");
 								        if (v) {
 								                char *buf;
 								                n = strcspn(v, " ");
 								                buf = strndupa(v, n);
 								                if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
 								                        log_debug("Unable to process exec-runtime netns fd specification.");
 								                        return;
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
+								                }
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                fd0 = fdset_remove(fds, fd0);
 								                if (v[n] != ' ')
 								                        goto finalize;
 								                p = v + n + 1;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        }
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								        v = startswith(p, "netns-socket-1=");
 								        if (v) {
 								                char *buf;
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                n = strcspn(v, " ");
 								                buf = strndupa(v, n);
 								                if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
 								                        log_debug("Unable to process exec-runtime netns fd specification.");
 								                        return;
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
+								                }
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								                fd1 = fdset_remove(fds, fd1);
 								        }
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								finalize:
 								        r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
 								        if (r < 0) {
 								                log_debug_errno(r, "Failed to add exec-runtime: %m");
 								                return;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        }
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								}
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make ExecRuntime be manager managed object

Before this, each ExecRuntime object is owned by a unit. However,
it may be shared with other units which enable JoinsNamespaceOf=.
Thus, by the serialization/deserialization process, its sharing
information, more specifically, reference counter is lost, and
causes issue #7790.

This makes ExecRuntime objects be managed by manager, and changes
the serialization/deserialization process.

Fixes #7790.

											
										
										
											2018-02-06 08:00:34 +01:00
+								void exec_runtime_vacuum(Manager *m) {
 								        ExecRuntime *rt;
 								        Iterator i;
 								        assert(m);
 								        /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
 								        HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
 								                if (rt->n_ref > 0)
 								                        continue;
 								                (void) exec_runtime_free(rt, false);
 								        }
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								}
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
 								        [EXEC_INPUT_NULL] = "null",
 								        [EXEC_INPUT_TTY] = "tty",
 								        [EXEC_INPUT_TTY_FORCE] = "tty-force",
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        [EXEC_INPUT_SOCKET] = "socket",
 								        [EXEC_INPUT_NAMED_FD] = "fd",
-												core: add two new unit file settings: StandardInputData= + StandardInputText=

Both permit configuring data to pass through STDIN to an invoked
process. StandardInputText= accepts a line of text (possibly with
embedded C-style escapes as well as unit specifiers), which is appended
to the buffer to pass as stdin, followed by a single newline.
StandardInputData= is similar, but accepts arbitrary base64 encoded
data, and will not resolve specifiers or C-style escapes, nor append
newlines.

This may be used to pass input/configuration data to services, directly
in-line from unit files, either in a cooked or in a more raw format.

											
										
										
											2017-10-27 11:33:05 +02:00
+								        [EXEC_INPUT_DATA] = "data",
-												core: add support for StandardInputFile= and friends

These new settings permit specifying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate the fd instead.

2) If an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								        [EXEC_INPUT_FILE] = "file",
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								};
-												systemctl: introduce systemctl kill

											
										
										
											2010-10-22 16:11:50 +02:00
+								DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        [EXEC_OUTPUT_INHERIT] = "inherit",
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        [EXEC_OUTPUT_NULL] = "null",
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        [EXEC_OUTPUT_TTY] = "tty",
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        [EXEC_OUTPUT_SYSLOG] = "syslog",
-												execute: optionally forward program output to /dev/console in addition to syslog/kmsg

											
										
										
											2011-02-15 01:27:53 +01:00
+								        [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
-												execute: s/EXEC_OUTPUT_KERNEL/EXEC_OUTPUT_KMSG/ to follow LOG_TARGET_xxx nomenclature

											
										
										
											2010-05-19 21:49:03 +02:00
+								        [EXEC_OUTPUT_KMSG] = "kmsg",
-												execute: optionally forward program output to /dev/console in addition to syslog/kmsg

											
										
										
											2011-02-15 01:27:53 +01:00
+								        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
-												journal: introduce log target 'journal' for executed processes

											
										
										
											2012-01-05 23:54:45 +01:00
+								        [EXEC_OUTPUT_JOURNAL] = "journal",
 								        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        [EXEC_OUTPUT_SOCKET] = "socket",
 								        [EXEC_OUTPUT_NAMED_FD] = "fd",
-												core: add support for StandardInputFile= and friends

These new settings permit specifying arbitrary paths as
stdin/stdout/stderr locations. We try to open/create them as necessary.
Some special magic is applied:

1) if the same path is specified for both input and output/stderr, we'll
   open it only once O_RDWR, and duplicate the fd instead.

2) If an AF_UNIX socket path is specified, we'll connect() to it,
   rather than open() it. This allows invoking systemd services with
   stdin/stdout/stderr connected to arbitrary foreign service sockets.

Fixes: #3991

											
										
										
											2017-10-27 16:09:57 +02:00
+								        [EXEC_OUTPUT_FILE] = "file",
-												Add support for opening files for appending

Addresses part of #8983

											
										
										
											2018-07-03 21:22:29 +02:00
+								        [EXEC_OUTPUT_FILE_APPEND] = "append",
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								};
 								DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
-												core: optionally create LOGIN_PROCESS or USER_PROCESS utmp entries

When generating utmp/wtmp entries, optionally add both LOGIN_PROCESS and
INIT_PROCESS entries or even all three of LOGIN_PROCESS, INIT_PROCESS
and USER_PROCESS entries, instead of just a single INIT_PROCESS entry.

With this change systemd may be used to not only invoke a getty directly
in a SysV-compliant way but alternatively also a login(1) implementation
or even forego getty and login entirely, and invoke arbitrary shells in
a way that they appear in who(1) or w(1).

This is preparation for a later commit that adds a "machinectl shell"
operation to invoke a shell in a container, in a way that is compatible
with who(1) and w(1).

											
										
										
											2015-08-23 13:14:04 +02:00
 								/* String names for the ExecUtmpMode enumerators ("init", "login", "user"), indexed by enumerator.
 								 * The array is sized _EXEC_UTMP_MODE_MAX; any slot without a designated initializer is NULL. */
 								static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
 								        [EXEC_UTMP_INIT] = "init",
 								        [EXEC_UTMP_LOGIN] = "login",
 								        [EXEC_UTMP_USER] = "user",
 								};
 								/* NOTE(review): DEFINE_STRING_TABLE_LOOKUP is defined outside this chunk; presumably it expands to
 								 * the to-string/from-string converters for this table and the ExecUtmpMode type - confirm. */
 								DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
-												core: allow preserving contents of RuntimeDirectory= over process restart

This introduces RuntimeDirectoryPreserve= option which takes a boolean
argument or 'restart'.

Closes #6087.

											
										
										
											2017-07-17 09:22:25 +02:00
 								/* String names for the ExecPreserveMode enumerators ("no", "yes", "restart"), indexed by enumerator.
 								 * The array is sized _EXEC_PRESERVE_MODE_MAX; any slot without a designated initializer is NULL. */
 								static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
 								        [EXEC_PRESERVE_NO] = "no",
 								        [EXEC_PRESERVE_YES] = "yes",
 								        [EXEC_PRESERVE_RESTART] = "restart",
 								};
 								/* NOTE(review): the _WITH_BOOLEAN macro is defined outside this chunk; presumably it generates
 								 * converters that additionally accept boolean spellings, with EXEC_PRESERVE_YES as the value used
 								 * for "true" - confirm against the macro definition. */
 								DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
 								        /* One string name per ExecDirectoryType enumerator, indexed by enumerator. The array is
 								         * sized _EXEC_DIRECTORY_TYPE_MAX; any slot without a designated initializer is NULL. */
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
 								        [EXEC_DIRECTORY_STATE] = "StateDirectory",
 								        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
 								        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
 								        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
 								};
 								/* NOTE(review): DEFINE_STRING_TABLE_LOOKUP is defined outside this chunk; presumably it expands to
 								 * the to-string/from-string converters for this table and the ExecDirectoryType type - confirm. */
 								DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
-												core: add new environment variable $RUNTIME_DIRECTORY= or friends

The variable is generated from RuntimeDirectory= or friends.
If multiple directories are set, then they are concatenated with
the separator ':'.

											
										
										
											2018-09-11 07:05:08 +02:00
+								static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
 								        /* One upper-case name per ExecDirectoryType enumerator, indexed by enumerator. This table
 								         * shares its index type and size (_EXEC_DIRECTORY_TYPE_MAX) with exec_directory_type_table.
 								         * NOTE(review): going by the table name, these are environment variable names - confirm
 								         * at the point of use. */
 								        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
 								        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
 								        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
 								        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
 								        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
 								};
 								/* NOTE(review): the macro is defined outside this chunk; going by its name it presumably emits
 								 * only a file-local to-string converter (no from-string parser) - confirm. */
 								DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
 								        /* String names for the ExecKeyringMode enumerators ("inherit", "private", "shared"),
 								         * indexed by enumerator. The array is sized _EXEC_KEYRING_MODE_MAX; any slot without a
 								         * designated initializer is NULL. */
 								        [EXEC_KEYRING_INHERIT] = "inherit",
 								        [EXEC_KEYRING_PRIVATE] = "private",
 								        [EXEC_KEYRING_SHARED] = "shared",
 								};
 								/* NOTE(review): DEFINE_STRING_TABLE_LOOKUP is defined outside this chunk; presumably it expands to
 								 * the to-string/from-string converters for this table and the ExecKeyringMode type - confirm. */
 								DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);