Systemd/src/core/execute.c

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <errno.h>
#include <fcntl.h>
#include <glob.h>
#include <grp.h>
#include <poll.h>
#include <signal.h>
#include <string.h>
#include <sys/capability.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <sys/personality.h>
#include <sys/prctl.h>
#include <sys/shm.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>
#include <utmpx.h>

#if HAVE_PAM
#include <security/pam_appl.h>
#endif

#if HAVE_SELINUX
#include <selinux/selinux.h>
#endif

#if HAVE_SECCOMP
#include <seccomp.h>
#endif

#if HAVE_APPARMOR
#include <sys/apparmor.h>
#endif

#include "sd-messages.h"

#include "af-list.h"
#include "alloc-util.h"
#if HAVE_APPARMOR
#include "apparmor-util.h"
#endif
#include "async.h"
#include "barrier.h"
#include "cap-list.h"
#include "capability-util.h"
#include "chown-recursive.h"
#include "def.h"
#include "env-util.h"
#include "errno-list.h"
#include "execute.h"
#include "exit-status.h"
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
#include "fs-util.h"
#include "glob-util.h"
#include "io-util.h"
#include "ioprio.h"
#include "label.h"
#include "log.h"
#include "macro.h"
#include "missing.h"
#include "mkdir.h"
#include "namespace.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "rlimit-util.h"
#include "rm-rf.h"
#if HAVE_SECCOMP
#include "seccomp-util.h"
#endif
#include "securebits.h"
#include "securebits-util.h"
#include "selinux-util.h"
#include "signal-util.h"
#include "smack-util.h"
#include "special.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
#include "syslog-util.h"
#include "terminal-util.h"
#include "unit.h"
#include "user-util.h"
#include "util.h"
#include "utmp-wtmp.h"

/* Idle timeouts, expressed as 5x and 1x USEC_PER_SEC. The code using them is not in this part of the file. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Access mode chown_terminal() applies to (and then verifies on) a TTY.
 * This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Size (8*1024*1024) passed to fd_inc_sndbuf() for the stream socket connected to the journal */
#define SNDBUF_SIZE (8*1024*1024)

/* Rearranges the passed file descriptors so that fds[i] ends up as fd number i+3, i.e. the array occupies a
 * contiguous range starting right after stdin/stdout/stderr. The array is updated in place.
 *
 * Returns 0 on success, or a negative errno-style error if duplicating an fd failed. */
static int shift_fds(int fds[], unsigned n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1, idx;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, moved;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == want)
                                continue;

                        /* Ask for the lowest free fd that is >= the wanted slot */
                        moved = fcntl(fds[idx], F_DUPFD, want);
                        if (moved < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = moved;

                        /* The wanted slot was still occupied (by an fd we have yet to move), so we landed
                         * somewhere else. Note the first such index and do another pass from there. */
                        if (moved != want && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}

/* Adjusts the file descriptor flags of all passed fds: FD_CLOEXEC is dropped from every fd, and O_NONBLOCK is
 * set or cleared (according to 'nonblock') on the first n_socket_fds entries only.
 *
 * Returns 0 on success, or the negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
        unsigned total = n_storage_fds + n_socket_fds, k;

        if (total == 0)
                return 0;

        assert(fds);

        for (k = 0; k < total; k++) {
                int r;

                /* O_NONBLOCK only applies to socket activation fds */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to be passed on to our children, hence FD_CLOEXEC is always
                 * turned off. */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}

/* Returns the TTY path to use for this execution context: NULL if stdio_as_fds is set, otherwise the
 * configured tty_path, falling back to "/dev/console" if none is configured. */
static const char *exec_context_tty_path(const ExecContext *context) {
        assert(context);

        if (context->stdio_as_fds)
                return NULL;

        return context->tty_path ?: "/dev/console";
}

/* Applies the TTY cleanup operations enabled in the context (vhangup, reset, VT disallocation). For vhangup
 * and reset the stdin fd from the parameters is used if there is one, otherwise the context's TTY path.
 * Failures of the individual operations are ignored. 'p' may be NULL. */
static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
        const char *tty;
        int stdin_fd;

        assert(context);

        tty = exec_context_tty_path(context);
        stdin_fd = p ? p->stdin_fd : -1;

        if (context->tty_vhangup) {
                if (stdin_fd >= 0)
                        (void) terminal_vhangup_fd(stdin_fd);
                else if (tty)
                        (void) terminal_vhangup(tty);
        }

        if (context->tty_reset) {
                if (stdin_fd >= 0)
                        (void) reset_terminal_fd(stdin_fd, true);
                else if (tty)
                        (void) reset_terminal(tty);
        }

        /* Disallocation always works on the path, never on the fd */
        if (tty && context->tty_vt_disallocate)
                (void) vt_disallocate(tty);
}

/* Returns true for the three TTY input modes (tty, tty-force, tty-fail). */
static bool is_terminal_input(ExecInput i) {
        switch (i) {

        case EXEC_INPUT_TTY:
        case EXEC_INPUT_TTY_FORCE:
        case EXEC_INPUT_TTY_FAIL:
                return true;

        default:
                return false;
        }
}

/* Returns true for output modes that write to a terminal: plain tty, and the "+console" variants of syslog,
 * kmsg and journal. */
static bool is_terminal_output(ExecOutput o) {
        switch (o) {

        case EXEC_OUTPUT_TTY:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                return true;

        default:
                return false;
        }
}

/* Returns true for the syslog output modes, with or without console. */
static bool is_syslog_output(ExecOutput o) {
        return o == EXEC_OUTPUT_SYSLOG ||
               o == EXEC_OUTPUT_SYSLOG_AND_CONSOLE;
}

/* Returns true for the kmsg output modes, with or without console. */
static bool is_kmsg_output(ExecOutput o) {
        return o == EXEC_OUTPUT_KMSG ||
               o == EXEC_OUTPUT_KMSG_AND_CONSOLE;
}

/* Tells whether the execution context suggests $TERM should be set to something useful: that is the case if
 * stdin, stdout or stderr is connected to a terminal, or if a TTY path is configured explicitly. */
static bool exec_context_needs_term(const ExecContext *c) {
        assert(c);

        return is_terminal_input(c->std_input) ||
               is_terminal_output(c->std_output) ||
               is_terminal_output(c->std_error) ||
               c->tty_path != NULL;
}

/* Opens /dev/null with the given access flags (plus O_NOCTTY) and installs it as file descriptor 'nfd'.
 *
 * Returns nfd on success, a negative errno-style error otherwise. */
static int open_null_as(int flags, int nfd) {
        int null_fd, r;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        /* Lucky case: the kernel already handed us the right fd number */
        if (null_fd == nfd)
                return nfd;

        r = nfd;
        if (dup2(null_fd, nfd) < 0)
                r = -errno;

        safe_close(null_fd);
        return r;
}

/* Connects the socket 'fd' to /run/systemd/journal/stdout. If a valid gid and/or uid is passed, the effective
 * GID/UID is switched to it for the duration of the connect() call and switched back afterwards.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
        static const union sockaddr_union sa = {
                .un.sun_family = AF_UNIX,
                .un.sun_path = "/run/systemd/journal/stdout",
        };
        const bool switch_gid = gid_is_valid(gid), switch_uid = uid_is_valid(uid);
        uid_t saved_uid = UID_INVALID;
        gid_t saved_gid = GID_INVALID;
        int r;

        if (switch_gid) {
                saved_gid = getgid();

                if (setegid(gid) < 0)
                        return -errno;
        }

        if (switch_uid) {
                saved_uid = getuid();

                if (seteuid(uid) < 0) {
                        /* The GID was possibly changed already, undo that before bailing out */
                        r = -errno;
                        goto restore_gid;
                }
        }

        /* Record the result right away, the calls below may clobber errno */
        if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0)
                r = -errno;
        else
                r = 0;

        /* Failing to switch back is deliberately ignored here: things will likely fail later on
         * then. This should only happen if an LSM interferes. */

        if (switch_uid)
                (void) seteuid(saved_uid);

 restore_gid:
        if (switch_gid)
                (void) setegid(saved_gid);

        return r;
}

/* Creates a stream socket, connects it to the journal's stdout socket (optionally with the effective UID/GID
 * temporarily switched to uid/gid), writes the stream header and installs the socket as file descriptor
 * 'nfd'.
 *
 * The header consists of seven newline-terminated fields: identifier (the context's syslog identifier, or
 * 'ident' if unset), unit id (empty unless EXEC_PASS_LOG_UNIT is set), syslog priority, level-prefix flag,
 * and three flags telling whether to forward to syslog, kmsg and the console.
 *
 * Returns nfd on success, a negative errno-style error otherwise. The temporary socket is closed on every
 * failure path. */
static int connect_logger_as(
                Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                ExecOutput output,
                const char *ident,
                int nfd,
                uid_t uid,
                gid_t gid) {

        int fd, r;

        assert(context);
        assert(params);
        assert(output < _EXEC_OUTPUT_MAX);
        assert(ident);
        assert(nfd >= 0);

        fd = socket(AF_UNIX, SOCK_STREAM, 0);
        if (fd < 0)
                return -errno;

        r = connect_journal_socket(fd, uid, gid);
        if (r < 0) {
                /* Don't leak the socket if we couldn't connect it */
                safe_close(fd);
                return r;
        }

        /* We only ever write to this socket */
        if (shutdown(fd, SHUT_RD) < 0) {
                /* Pick up errno before closing, so that we don't depend on close leaving it alone */
                r = -errno;
                safe_close(fd);
                return r;
        }

        (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);

        dprintf(fd,
                "%s\n"
                "%s\n"
                "%i\n"
                "%i\n"
                "%i\n"
                "%i\n"
                "%i\n",
                context->syslog_identifier ?: ident,
                params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
                context->syslog_priority,
                !!context->syslog_level_prefix,
                is_syslog_output(output),
                is_kmsg_output(output),
                is_terminal_output(output));

        /* Already at the right fd number? Then there's nothing to move. */
        if (fd == nfd)
                return nfd;

        r = dup2(fd, nfd) < 0 ? -errno : nfd;
        safe_close(fd);

        return r;
}
/* Opens the terminal at 'path' with the given open flags (plus O_NOCTTY) and installs it as file descriptor
 * 'nfd'.
 *
 * Returns nfd on success, a negative error otherwise. */
static int open_terminal_as(const char *path, mode_t mode, int nfd) {
        int tty_fd, r;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, mode | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        /* Already at the requested fd number, nothing to move */
        if (tty_fd == nfd)
                return nfd;

        r = nfd;
        if (dup2(tty_fd, nfd) < 0)
                r = -errno;

        safe_close(tty_fd);
        return r;
}

/* Maps the configured input mode to the one actually usable: falls back to EXEC_INPUT_NULL if a TTY input
 * was requested but applying TTY stdin is not enabled, or if socket input was requested but no socket fd is
 * available. Any other mode is returned unchanged. */
static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) {
        bool tty_denied, socket_missing;

        tty_denied = is_terminal_input(std_input) && !apply_tty_stdin;
        socket_missing = std_input == EXEC_INPUT_SOCKET && socket_fd < 0;

        return (tty_denied || socket_missing) ? EXEC_INPUT_NULL : std_input;
}

/* Maps the configured output mode to the one actually usable: socket output without a socket fd falls back
 * to EXEC_OUTPUT_INHERIT, everything else is returned unchanged. */
static int fixup_output(ExecOutput std_output, int socket_fd) {
        bool socket_missing = std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0;

        return socket_missing ? EXEC_OUTPUT_INHERIT : std_output;
}

/* Sets up STDIN_FILENO. If the parameters carry an explicit stdin fd it is duplicated into place (and we try
 * to make it the controlling TTY and reset it). Otherwise stdin is set up according to the context's input
 * mode, after fixup_input() was applied to it: /dev/null, an acquired terminal, the socket fd, or the named
 * fd stored in named_iofds[STDIN_FILENO].
 *
 * Returns STDIN_FILENO on success, a negative error otherwise. */
static int setup_input(
                const ExecContext *context,
                const ExecParameters *params,
                int socket_fd,
                int named_iofds[3]) {

        ExecInput kind;
        int tty_fd, r;

        assert(context);
        assert(params);

        if (params->stdin_fd >= 0) {
                if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
                        return -errno;

                /* If this is a TTY, try to make it our controlling TTY and reset it. Failures are
                 * ignored, since the fd might not be a TTY at all. */
                (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
                (void) reset_terminal_fd(STDIN_FILENO, true);

                return STDIN_FILENO;
        }

        kind = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);

        switch (kind) {

        case EXEC_INPUT_NULL:
                return open_null_as(O_RDONLY, STDIN_FILENO);

        case EXEC_INPUT_TTY:
        case EXEC_INPUT_TTY_FORCE:
        case EXEC_INPUT_TTY_FAIL:
                tty_fd = acquire_terminal(exec_context_tty_path(context),
                                          kind == EXEC_INPUT_TTY_FAIL,
                                          kind == EXEC_INPUT_TTY_FORCE,
                                          false,
                                          USEC_INFINITY);
                if (tty_fd < 0)
                        return tty_fd;

                /* The terminal was opened directly as stdin, nothing to move */
                if (tty_fd == STDIN_FILENO)
                        return STDIN_FILENO;

                r = STDIN_FILENO;
                if (dup2(tty_fd, STDIN_FILENO) < 0)
                        r = -errno;

                safe_close(tty_fd);
                return r;

        case EXEC_INPUT_SOCKET:
                if (dup2(socket_fd, STDIN_FILENO) < 0)
                        return -errno;

                return STDIN_FILENO;

        case EXEC_INPUT_NAMED_FD:
                /* Turn off O_NONBLOCK on the named fd before handing it over */
                (void) fd_nonblock(named_iofds[STDIN_FILENO], false);

                if (dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0)
                        return -errno;

                return STDIN_FILENO;

        default:
                assert_not_reached("Unknown input type");
        }
}

/* Sets up the output file descriptor 'fileno' (STDOUT_FILENO or STDERR_FILENO) according to the execution
 * context and parameters.
 *
 * An explicit stdout_fd/stderr_fd in the parameters takes precedence over everything else. Otherwise the
 * configured output mode is used (after fixup_output() was applied to it), where stderr may simply be
 * duplicated from stdout, and "inherit" is resolved depending on how stdin was set up.
 *
 * If the fd gets connected to the journal via a stream, the device and inode numbers of that stream are
 * stored in *journal_stream_dev and *journal_stream_ino.
 *
 * Returns 'fileno' on success, a negative error otherwise. */
static int setup_output(
                Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                int named_iofds[3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds always win over the configured output mode */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* Determine the effective input and stdout modes, the same way setup_input() does for stdin */
        i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on, handle stderr according to its own mode */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null, inherit that... */
                if (i != EXEC_INPUT_NULL)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal too, share it rather than opening the terminal a second time */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not being able to log is not fatal: warn and fall back to /dev/null */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                /* fixup_output() turned socket output into "inherit" if there was no socket fd */
                assert(socket_fd >= 0);
                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                /* Turn off O_NONBLOCK on the named fd before handing it over */
                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        default:
                assert_not_reached("Unknown error type");
        }
}

/* Makes 'uid' the owner of the TTY referred to by 'fd' and sets its access mode to TTY_MODE. Does nothing
 * (and returns 0) if the fd does not refer to a TTY.
 *
 * Returns 0 on success, -EPERM if ownership or mode are not as requested afterwards, or another negative
 * errno-style error if the fd could not be stat'ed. */
static int chown_terminal(int fd, uid_t uid) {
        struct stat st;

        assert(fd >= 0);

        /* Only touch ownership and mode of actual TTYs */
        if (isatty(fd) <= 0)
                return 0;

        /* Both calls may fail, which is fine as long as the end result checked below is right */
        (void) fchown(fd, uid, -1);
        (void) fchmod(fd, TTY_MODE);

        if (fstat(fd, &st) < 0)
                return -errno;

        if (st.st_uid == uid && (st.st_mode & 0777) == TTY_MODE)
                return 0;

        return -EPERM;
}

static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
        _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
        int r;

        assert(_saved_stdin);
        assert(_saved_stdout);

        saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
        if (saved_stdin < 0)
                return -errno;

        saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
        if (saved_stdout < 0)
                return -errno;

        fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
        if (fd < 0)
                return fd;

        r = chown_terminal(fd, getuid());
        if (r < 0)
                return r;

        r = reset_terminal_fd(fd, true);
        if (r < 0)
                return r;

        if (dup2(fd, STDIN_FILENO) < 0)
                return -errno;

        if (dup2(fd, STDOUT_FILENO) < 0)
                return -errno;

        if (fd >= 2)
                safe_close(fd);
        fd = -1;

        *_saved_stdin = saved_stdin;
        *_saved_stdout = saved_stdout;

        saved_stdin = saved_stdout = -1;

        return 0;
}

/* Writes a message to 'fd' saying that asking for confirmation for unit 'u' failed with the (negative) error
 * 'err', and that a positive response is assumed. Timeouts get their own wording. */
static void write_confirm_error_fd(int err, int fd, const Unit *u) {
        assert(err < 0);

        if (err == -ETIMEDOUT) {
                dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
                return;
        }

        /* %m expands to the error string for errno, hence store our error there first */
        errno = -err;
        dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
}

static void write_confirm_error(int err, const char *vc, const Unit *u) {
        _cleanup_close_ int fd = -1;

        assert(vc);

        fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
        if (fd < 0)
                return;

        write_confirm_error_fd(err, fd, u);
}

/* Releases the terminal and puts the saved stdin/stdout fds back in place. Saved fds that are negative are
 * skipped. Both saved fds are closed, and the variables updated with what safe_close() returns.
 *
 * Returns 0 on success, or the negative errno-style error of the last dup2() call that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}

/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y': execute the command; also the result on internal errors */
};

/* Interactively asks on the terminal 'vc' whether the command line 'cmdline' of unit 'u' shall be executed.
 * stdin/stdout are redirected to the terminal while asking and restored afterwards.
 *
 * Returns CONFIRM_EXECUTE, CONFIRM_PRETEND_SUCCESS or CONFIRM_PRETEND_FAILURE. If anything goes wrong
 * internally (terminal setup fails, out of memory, asking fails or times out), or if confirmation was
 * disabled in the meantime, CONFIRM_EXECUTE is returned. */
static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
        int saved_stdout = -1, saved_stdin = -1, r;
        _cleanup_free_ char *e = NULL;
        char c;

        /* For any internal errors, assume a positive response. */
        r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
        if (r < 0) {
                write_confirm_error(r, vc, u);
                return CONFIRM_EXECUTE;
        }

        /* confirm_spawn might have been disabled while we were sleeping. */
        if (manager_is_confirm_spawn_disabled(u->manager)) {
                r = CONFIRM_EXECUTE;
                goto restore_stdio;
        }

        /* Shorten the command line so that the question fits on one line */
        e = ellipsize(cmdline, 60, 100);
        if (!e) {
                log_oom();
                r = CONFIRM_EXECUTE;
                goto restore_stdio;
        }

        for (;;) {
                r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
                if (r < 0) {
                        write_confirm_error_fd(r, STDOUT_FILENO, u);
                        r = CONFIRM_EXECUTE;
                        goto restore_stdio;
                }

                /* Choices that only print information 'continue' to ask again; choices that settle the
                 * question 'break' out of the switch and then out of the loop. */
                switch (c) {
                case 'c':
                        printf("Resuming normal execution.\n");
                        manager_disable_confirm_spawn();
                        r = CONFIRM_EXECUTE;
                        break;
                case 'D':
                        unit_dump(u, stdout, "  ");
                        continue; /* ask again */
                case 'f':
                        printf("Failing execution.\n");
                        r = CONFIRM_PRETEND_FAILURE;
                        break;
                case 'h':
                        printf("  c - continue, proceed without asking anymore\n"
                               "  D - dump, show the state of the unit\n"
                               "  f - fail, don't execute the command and pretend it failed\n"
                               "  h - help\n"
                               "  i - info, show a short summary of the unit\n"
                               "  j - jobs, show jobs that are in progress\n"
                               "  s - skip, don't execute the command and pretend it succeeded\n"
                               "  y - yes, execute the command\n");
                        continue; /* ask again */
                case 'i':
                        /* The arguments have to follow the order of the labels: description first,
                         * then the unit id. */
                        printf("  Description: %s\n"
                               "  Unit:        %s\n"
                               "  Command:     %s\n",
                               u->description, u->id, cmdline);
                        continue; /* ask again */
                case 'j':
                        manager_dump_jobs(u->manager, stdout, "  ");
                        continue; /* ask again */
                case 'n':
                        /* 'n' was removed in favor of 'f'. */
                        printf("Didn't understand 'n', did you mean 'f'?\n");
                        continue; /* ask again */
                case 's':
                        printf("Skipping execution.\n");
                        r = CONFIRM_PRETEND_SUCCESS;
                        break;
                case 'y':
                        r = CONFIRM_EXECUTE;
                        break;
                default:
                        assert_not_reached("Unhandled choice");
                }
                break;
        }

restore_stdio:
        restore_confirm_stdio(&saved_stdin, &saved_stdout);
        return r;
}

/* Resolves the user configured in the context, if there is one, via get_user_creds_clean(), which fills in
 * *uid, *gid, *home and *shell. On success *user is set to the name as returned by the lookup.
 *
 * Returns 0 on success, and also if no user is configured (in which case none of the output parameters is
 * touched), or the negative error of the lookup. */
static int get_fixed_user(const ExecContext *c, const char **user,
                          uid_t *uid, gid_t *gid,
                          const char **home, const char **shell) {
        const char *resolved;
        int r;

        assert(c);

        resolved = c->user;
        if (!resolved)
                return 0;

        /* Note that we use the "clean" lookup here, so that $HOME or $SHELL are not set if they are not
         * particularly enlightening anyway (i.e. are "/" or "/bin/nologin"). */
        r = get_user_creds_clean(&resolved, uid, gid, home, shell);
        if (r < 0)
                return r;

        *user = resolved;
        return 0;
}

/* Resolves the group configured in the context, if there is one, via get_group_creds(), which fills in *gid.
 * On success *group is set to the name as returned by the lookup.
 *
 * Returns 0 on success, and also if no group is configured (in which case the output parameters are not
 * touched), or the negative error of the lookup. */
static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
        const char *resolved;
        int r;

        assert(c);

        resolved = c->group;
        if (!resolved)
                return 0;

        r = get_group_creds(&resolved, gid);
        if (r < 0)
                return r;

        *group = resolved;
        return 0;
}

/* Initializes the supplementary groups of the calling process for 'user' (if a user and a valid, non-zero
 * gid are passed), and builds the list of supplementary GIDs: the groups of the user, if looked up, followed
 * by the groups listed in the context's supplementary_groups.
 *
 * On success returns 0 and stores a newly allocated array in *supplementary_gids and its length in *ngids;
 * the caller has to free the array. Note that if the context lists no supplementary groups 0 is returned
 * without touching either output parameter. On failure a negative error is returned. */
static int get_supplementary_groups(const ExecContext *c, const char *user,
                                    const char *group, gid_t gid,
                                    gid_t **supplementary_gids, int *ngids) {
        char **i;
        int r, k = 0;
        int ngroups_max;
        bool keep_groups = false;
        gid_t *groups = NULL;
        _cleanup_free_ gid_t *l_gids = NULL;

        assert(c);

        /*
         * If user is given, then lookup GID and supplementary groups list.
         * We avoid NSS lookups for gid=0. Also we have to initialize groups
         * here and as early as possible so we keep the list of supplementary
         * groups of the caller.
         */
        if (user && gid_is_valid(gid) && gid != 0) {
                /* First step, initialize groups from the group database */
                if (initgroups(user, gid) < 0)
                        return -errno;

                keep_groups = true;
        }

        /* Nothing configured: we are done, the output parameters are left as they are */
        if (strv_isempty(c->supplementary_groups))
                return 0;

        /*
         * If SupplementaryGroups= was passed then NGROUPS_MAX has to
         * be positive, otherwise fail.
         */
        errno = 0; /* so that we can tell whether sysconf() actually failed */
        ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
        if (ngroups_max <= 0) {
                if (errno > 0)
                        return -errno;
                else
                        return -EOPNOTSUPP; /* For all other values */
        }

        /* Scratch array with room for the maximum number of groups, freed automatically on return */
        l_gids = new(gid_t, ngroups_max);
        if (!l_gids)
                return -ENOMEM;

        if (keep_groups) {
                /*
                 * Lookup the list of groups that the user belongs to, we
                 * avoid NSS lookups here too for gid=0.
                 */
                k = ngroups_max; /* in: size of the array, out: number of groups found */
                if (getgrouplist(user, gid, l_gids, &k) < 0)
                        return -EINVAL;
        } else
                k = 0;

        /* Append the explicitly configured groups after the ones of the user */
        STRV_FOREACH(i, c->supplementary_groups) {
                const char *g;

                if (k >= ngroups_max)
                        return -E2BIG;

                g = *i;
                r = get_group_creds(&g, l_gids+k);
                if (r < 0)
                        return r;

                k++;
        }

        /*
         * Sets ngids to zero to drop all supplementary groups, happens
         * when we are under root and SupplementaryGroups= is empty.
         *
         * NOTE(review): an empty SupplementaryGroups= list already returned above, and each configured
         * group increments k, so this branch looks unreachable; confirm before relying on it.
         */
        if (k == 0) {
                *ngids = 0;
                return 0;
        }

        /* Otherwise get the final list of supplementary groups */
        groups = memdup(l_gids, sizeof(gid_t) * k);
        if (!groups)
                return -ENOMEM;

        /* Ownership of the copy passes to the caller */
        *supplementary_gids = groups;
        *ngids = k;

        groups = NULL;

        return 0;
}

/* Applies group credentials to the calling process: first the supplementary groups (only if ngids is
 * positive), then real, effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, a negative error otherwise. */
static int enforce_groups(gid_t gid, gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}

/* Switches real, effective and saved UID to 'uid', without doing any lookups. If the context has ambient
 * capabilities configured and the target is not root, the keep-caps secure bit is turned on first, so that
 * the capabilities survive the UID change. Does nothing if uid is invalid.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_user(const ExecContext *context, uid_t uid) {
        assert(context);

        if (!uid_is_valid(uid))
                return 0;

        /* First step: if we shall keep capabilities while dropping privileges, make sure the kernel
         * doesn't clear them when the UID changes. */
        if (context->capability_ambient_set != 0 && uid != 0) {
                int wanted_bits = context->secure_bits | 1<<SECURE_KEEP_CAPS;

                /* Only write the secure bits if they aren't what we want already */
                if (prctl(PR_GET_SECUREBITS) != wanted_bits &&
                    prctl(PR_SET_SECUREBITS, wanted_bits) < 0)
                        return -errno;
        }

        /* Second step: actually set the uids */
        if (setresuid(uid, uid, uid) < 0)
                return -errno;

        /* At this point we should have all necessary capabilities but are otherwise a normal user.
         * However, the caps might have gotten corrupted due to the setresuid(), so they need to be
         * cleaned up later. This is done outside of this call. */

        return 0;
}

#if HAVE_PAM

/* PAM conversation callback, installed as the .conv member of the pam_conv structure in setup_pam(). It
 * ignores all of its arguments and always fails with PAM_CONV_ERR. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        /* We don't support conversations */

        return PAM_CONV_ERR;
}

#endif

/* Opens a PAM session for service 'name' and user 'user', and forks off a helper process "(sd-pam)" that
 * stays around to close the session again once it receives SIGTERM or its parent goes away.
 *
 *   name, user: PAM service name and user name handed to pam_start()
 *   uid, gid:   credentials the helper process drops to (best effort, failures are only logged)
 *   tty:        optional TTY path, set as PAM_TTY if non-NULL
 *   env:        environment block; its entries are exported to PAM, and on success *env is freed and
 *               replaced by the environment list returned by PAM
 *   fds, n_fds: file descriptors that are closed inside the helper process
 *
 * Returns 0 on success and a negative errno-style value on failure (-EPERM for PAM-level errors). When
 * built without PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the caller's environment block to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        pam_pid = fork();
        if (pam_pid < 0) {
                r = -errno;
                goto fail;
        }

        if (pam_pid == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* This string must fit in 10 chars (i.e. the length
                 * of "/sbin/init"), to look pretty in /bin/ps */
                rename_process("(sd-pam)");

                /* Make sure we don't keep open the passed fds in this
                child. We assume that otherwise only those fds are
                open here that have been opened by PAM. */
                close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Note that sigwait() reports failure by returning a positive error
                                 * number; it never returns a negative value and does not set errno.
                                 * Hence check the return value itself, so that we never look at an
                                 * unset 'sig' below. */
                                r = sigwait(&ss, &sig);
                                if (r != 0) {
                                        if (r == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment as seen by PAM back to the caller */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}

/* Renames the current process to "(<name>)", where <name> is derived from the last component of 'path'. If
 * that component is empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *base, *tail;
        size_t n;

        /* The final string has to fit into 10 characters (that's the length of "/sbin/init") so that it
         * looks pretty in /bin/ps: two parentheses plus at most 8 characters of the name. */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        /* For long names keep the trailing 8 characters: the tail is usually the more interesting part,
         * as the beginning might just be "systemd-". */
        n = strlen(base);
        tail = base;
        if (n > 8) {
                tail += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = '\0';

        rename_process(buf);
}

/* Returns true if the context carries an address family restriction: either the whitelist flag is set, or
 * the set of address families is non-empty. */
static bool context_has_address_families(const ExecContext *c) {
        assert(c);

        if (c->address_families_whitelist)
                return true;

        return !set_isempty(c->address_families);
}

/* Returns true if the context carries a system call filter: either the whitelist flag is set, or the set of
 * filtered system calls is non-empty. */
static bool context_has_syscall_filters(const ExecContext *c) {
        assert(c);

        if (c->syscall_whitelist)
                return true;

        return !set_isempty(c->syscall_filter);
}

/* Decides whether "no new privileges" is wanted for this context: true if it is configured explicitly, or if
 * we lack CAP_SYS_ADMIN and any of the seccomp-backed settings checked below is enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We are unprivileged: any form of seccomp requires NNP */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute || c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices)
                return true;

        if (context_has_syscall_filters(c))
                return true;

        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}

#if HAVE_SECCOMP

/* Returns true if seccomp is not available and the caller should hence skip applying the setting named by
 * 'msg' (a debug message is logged for unit 'u' in that case). Returns false if seccomp is available. */
static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
        bool available;

        available = is_seccomp_available();
        if (!available)
                log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);

        return !available;
}

/* Loads the system call filter configured in 'c'. Does nothing (and returns 0) if no filter is configured or
 * seccomp is unavailable. If 'needs_ambient_hack' is set, the @setuid filter set is first added to the
 * context's filter. Returns the result of loading the filter, or a negative value if extending it failed. */
static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
        uint32_t deny, fallback, on_match;
        int r;

        assert(u);
        assert(c);

        if (!context_has_syscall_filters(c) ||
            skip_seccomp_unavailable(u, "SystemCallFilter="))
                return 0;

        /* What to do with a rejected call: kill, unless an errno was configured */
        if (c->syscall_errno == 0)
                deny = SCMP_ACT_KILL;
        else
                deny = SCMP_ACT_ERRNO(c->syscall_errno);

        /* Whitelist: reject by default, allow what is listed. Otherwise: allow by default, reject what is
         * listed. */
        fallback = c->syscall_whitelist ? deny : SCMP_ACT_ALLOW;
        on_match = c->syscall_whitelist ? SCMP_ACT_ALLOW : deny;

        if (needs_ambient_hack) {
                r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
                if (r < 0)
                        return r;
        }

        return seccomp_load_syscall_filter_set_raw(fallback, c->syscall_filter, on_match);
}

/* Restricts the system call architectures to those listed in 'c'. Returns 0 without doing anything if the
 * list is empty or seccomp is unavailable. */
static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (set_isempty(c->syscall_archs) ||
            skip_seccomp_unavailable(u, "SystemCallArchitectures="))
                return 0;

        return seccomp_restrict_archs(c->syscall_archs);
}

/* Applies the address family restriction configured in 'c'. Returns 0 without doing anything if no
 * restriction is configured or seccomp is unavailable. */
static int apply_address_families(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!context_has_address_families(c) ||
            skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
                return 0;

        return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
}

/* Installs the MemoryDenyWriteExecute= seccomp filter if enabled in 'c'. Returns 0 without doing anything if
 * the setting is off or seccomp is unavailable. */
static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->memory_deny_write_execute ||
            skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
                return 0;

        return seccomp_memory_deny_write_execute();
}

/* Installs the RestrictRealtime= seccomp filter if enabled in 'c'. Returns 0 without doing anything if the
 * setting is off or seccomp is unavailable. */
static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->restrict_realtime ||
            skip_seccomp_unavailable(u, "RestrictRealtime="))
                return 0;

        return seccomp_restrict_realtime();
}

/* Blocks the legacy sysctl() system call if ProtectKernelTunables= is enabled in 'c'. Many distributions
 * already disable this call at kernel build time, but let's also cover systems where it is still available.
 * Returns 0 without doing anything if the setting is off or seccomp is unavailable. */
static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->protect_kernel_tunables ||
            skip_seccomp_unavailable(u, "ProtectKernelTunables="))
                return 0;

        return seccomp_protect_sysctl();
}

/* On ProtectKernelModules=yes, makes the system calls of the @module filter set fail with EPERM while
 * everything else stays allowed. Returns 0 without doing anything if the setting is off or seccomp is
 * unavailable. */
static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->protect_kernel_modules ||
            skip_seccomp_unavailable(u, "ProtectKernelModules="))
                return 0;

        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
}

/* If PrivateDevices= is set, additionally makes iopl and all other @raw-io system calls fail with EPERM
 * while everything else stays allowed. Returns 0 without doing anything if the setting is off or seccomp is
 * unavailable. */
static int apply_private_devices(const Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!c->private_devices ||
            skip_seccomp_unavailable(u, "PrivateDevices="))
                return 0;

        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
}

/* Applies the namespace restriction mask configured in 'c'. Returns 0 without doing anything if no
 * restriction is set or seccomp is unavailable. */
static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!exec_context_restrict_namespaces_set(c) ||
            skip_seccomp_unavailable(u, "RestrictNamespaces="))
                return 0;

        return seccomp_restrict_namespaces(c->restrict_namespaces);
}

/* Locks the personality if LockPersonality= is enabled in 'c'. Returns 0 without doing anything if the
 * setting is off or seccomp is unavailable, a negative value if no personality could be determined, and
 * the result of installing the lock otherwise. */
static int apply_lock_personality(const Unit* u, const ExecContext *c) {
        unsigned long personality;
        int r;

        assert(u);
        assert(c);

        if (!c->lock_personality ||
            skip_seccomp_unavailable(u, "LockPersonality="))
                return 0;

        /* Start out with the configured personality. If none was specified, fall back to either PER_LINUX or
         * PER_LINUX32, depending on what is currently set. */
        personality = c->personality;
        if (personality == PERSONALITY_INVALID) {
                r = opinionated_personality(&personality);
                if (r < 0)
                        return r;
        }

        return seccomp_lock_personality(personality);
}

#endif

/* Waits on the idle pipe before continuing. Entries 1 and 2 are closed right away; entry 0 is polled for
 * POLLHUP, and on timeout a byte is written to entry 3 followed by a second, shorter wait. All four entries
 * are closed (and reset via safe_close()) by the time this returns. */
static void do_idle_pipe_dance(int idle_pipe[4]) {
        assert(idle_pipe);

        idle_pipe[1] = safe_close(idle_pipe[1]);
        idle_pipe[2] = safe_close(idle_pipe[2]);

        if (idle_pipe[0] >= 0) {
                bool timed_out;

                timed_out = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC) == 0;

                /* On timeout, signal systemd that we are bored and want to continue; if that write went
                 * through, wait for systemd to react to it. */
                if (idle_pipe[3] >= 0 && timed_out &&
                    write(idle_pipe[3], "x", 1) > 0)
                        fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);

                idle_pipe[0] = safe_close(idle_pipe[0]);
        }

        idle_pipe[3] = safe_close(idle_pipe[3]);
}

/* Builds the block of environment variables we set for a process on our own (as opposed to those configured
 * by the user or passed through from our own environment).
 *
 *   u:                  unit the process belongs to; its invocation ID is exported if set
 *   c:                  execution context, used to decide whether and how to set $TERM
 *   p:                  execution parameters (flags, watchdog timeout, fd names)
 *   n_fds:              number of passed file descriptors; if > 0 the $LISTEN_* variables are set
 *   home/username/shell: optional, exported as $HOME, $LOGNAME + $USER, $SHELL when non-NULL
 *   journal_stream_dev/ino: exported as $JOURNAL_STREAM if both are non-zero
 *   ret:                on success receives the newly allocated, NULL-terminated string array
 *
 * Returns 0 on success, -ENOMEM if an allocation fails. */
static int build_environment(
                Unit *u,
                const ExecContext *c,
                const ExecParameters *p,
                unsigned n_fds,
                const char *home,
                const char *username,
                const char *shell,
                dev_t journal_stream_dev,
                ino_t journal_stream_ino,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        unsigned n_env = 0;
        char *x;

        assert(u);
        assert(c);
        assert(p);
        assert(ret);

        /* We set at most 13 variables below (3x LISTEN_*, 2x WATCHDOG_*, SYSTEMD_NSS_BYPASS_BUS, HOME, LOGNAME,
         * USER, SHELL, INVOCATION_ID, TERM, JOURNAL_STREAM), plus one slot for the terminating NULL. Keep this
         * number in sync with the assert() at the end when adding variables. */
        our_env = new0(char*, 14);
        if (!our_env)
                return -ENOMEM;

        if (n_fds > 0) {
                _cleanup_free_ char *joined = NULL;

                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                joined = strv_join(p->fd_names, ":");
                if (!joined)
                        return -ENOMEM;

                x = strjoin("LISTEN_FDNAMES=", joined);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
         * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
         * check the database directly. */
        if (p->flags & EXEC_NSS_BYPASS_BUS) {
                x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (home) {
                x = strappend("HOME=", home);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (username) {
                x = strappend("LOGNAME=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;

                x = strappend("USER=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (shell) {
                x = strappend("SHELL=", shell);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (!sd_id128_is_null(u->invocation_id)) {
                if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (exec_context_needs_term(c)) {
                const char *tty_path, *term = NULL;

                tty_path = exec_context_tty_path(c);

                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
                 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
                 * passes to PID 1 ends up all the way in the console login shown. */

                if (path_equal(tty_path, "/dev/console") && getppid() == 1)
                        term = getenv("TERM");
                if (!term)
                        term = default_term_for_tty(tty_path);

                x = strappend("TERM=", term);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        our_env[n_env++] = NULL;

        /* Must match the number of slots allocated above: with every optional variable set we legitimately
         * end up with 13 entries plus the NULL terminator. */
        assert(n_env <= 14);

        /* Pass ownership to the caller; clearing the local pointer disarms the cleanup handler. */
        *ret = our_env;
        our_env = NULL;

        return 0;
}

static int build_pass_environment(const ExecContext *c, char ***ret) {
        _cleanup_strv_free_ char **pass_env = NULL;
        size_t n_env = 0, n_bufsize = 0;
        char **i;

        STRV_FOREACH(i, c->pass_environment) {
                _cleanup_free_ char *x = NULL;
                char *v;

                v = getenv(*i);
                if (!v)
                        continue;
                x = strjoin(*i, "=", v);
                if (!x)
                        return -ENOMEM;

                if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
                        return -ENOMEM;

                pass_env[n_env++] = x;
                pass_env[n_env] = NULL;
                x = NULL;
        }

        *ret = pass_env;
        pass_env = NULL;

        return 0;
}

/* Returns true if any setting in 'context' (in combination with 'runtime', which may be NULL) is checked
 * below as requiring a mount namespace, false otherwise. */
static bool exec_needs_mount_namespace(
                const ExecContext *context,
                const ExecParameters *params,
                ExecRuntime *runtime) {

        assert(context);
        assert(params);

        /* A root image is set */
        if (context->root_image)
                return true;

        /* Any per-path access list is in use */
        if (!strv_isempty(context->read_write_paths))
                return true;
        if (!strv_isempty(context->read_only_paths))
                return true;
        if (!strv_isempty(context->inaccessible_paths))
                return true;

        /* Bind mounts or explicit mount flags are configured */
        if (context->n_bind_mounts > 0 || context->mount_flags != 0)
                return true;

        /* Private /tmp only counts if the runtime object actually carries a directory for it */
        if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
                return true;

        /* The various protection knobs */
        if (context->private_devices)
                return true;
        if (context->protect_system != PROTECT_SYSTEM_NO)
                return true;
        if (context->protect_home != PROTECT_HOME_NO)
                return true;
        if (context->protect_kernel_tunables || context->protect_kernel_modules || context->protect_control_groups)
                return true;

        /* API file systems are requested together with a changed root */
        if (context->mount_apivfs && (context->root_image || context->root_directory))
                return true;

        /* Dynamic users with at least one runtime, state, cache or logs directory configured */
        if (context->dynamic_user) {
                static const ExecDirectoryType checked[] = {
                        EXEC_DIRECTORY_RUNTIME,
                        EXEC_DIRECTORY_STATE,
                        EXEC_DIRECTORY_CACHE,
                        EXEC_DIRECTORY_LOGS,
                };
                size_t i;

                for (i = 0; i < sizeof(checked) / sizeof(checked[0]); i++)
                        if (!strv_isempty(context->directories[checked[i]].paths))
                                return true;
        }

        return false;
}

/* Moves the calling process into a new user namespace whose UID/GID maps contain root plus (if valid and
 * non-zero) the specified uid/gid, each mapped to itself. The maps are written by a temporary child process
 * forked off before the namespace is created.
 *
 * Returns 0 on success and a negative errno-style value on failure; -EIO is returned if the child reports
 * something unexpected or does not exit cleanly. */
static int setup_private_users(uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
        _cleanup_close_ int unshare_ready_fd = -1;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1;
        siginfo_t si;
        ssize_t n;
        int r;

        /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally. */

        /* Prepare the text of both maps up front, before anything is forked */
        if (uid != 0 && uid_is_valid(uid)) {
                r = asprintf(&uid_map,
                             "0 0 1\n"                      /* Map root → root */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             uid, uid);
                if (r < 0)
                        return -ENOMEM;
        } else {
                uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
                if (!uid_map)
                        return -ENOMEM;
        }

        if (gid != 0 && gid_is_valid(gid)) {
                r = asprintf(&gid_map,
                             "0 0 1\n"                      /* Map root → root */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             gid, gid);
                if (r < 0)
                        return -ENOMEM;
        } else {
                gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
                if (!gid_map)
                        return -ENOMEM;
        }

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        pid = fork();
        if (pid < 0)
                return -errno;

        if (pid == 0) {
                _cleanup_close_ int fd = -1;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                _exit(EXIT_SUCCESS);

        child_fail:
                /* Best effort: report the (negative) error code to the parent through the pipe */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that the read() below sees EOF once the child is done */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO; /* a non-negative "error" makes no sense, treat it as generic failure */
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        r = wait_for_terminate(pid, &si);
        if (r < 0)
                return r;
        pid = 0; /* the child has been reaped, make sure the cleanup handler leaves it alone */

        /* If something strange happened with the child, let's consider this fatal, too */
        if (si.si_code != CLD_EXITED || si.si_status != 0)
                return -EIO;

        return 0;
}

static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                int *exit_status) {

        /* Creates every directory configured for the given directory type below params->prefix[type], applies
         * the configured access mode and (except for configuration directories) chowns the tree to uid/gid.
         * Returns 0 on success (or if no prefix is set for this type). On failure returns a negative errno-style
         * value and stores the matching exit code from the table below in *exit_status. */

        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        bool use_private_root;
        char **entry;
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        if (!params->prefix[type])
                return 0;

        /* When asked to chown the directories, fall back to root for any ID that is not valid */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                uid = uid_is_valid(uid) ? uid : 0;
                gid = gid_is_valid(gid) ? gid : 0;
        }

        /* Whether the directories are placed below the locked-down "private/" root (see below) */
        use_private_root = context->dynamic_user && type != EXEC_DIRECTORY_CONFIGURATION;

        STRV_FOREACH(entry, context->directories[type].paths) {
                _cleanup_free_ char *public_path = NULL, *private_path = NULL;
                const char *target;

                public_path = strjoin(params->prefix[type], "/", *entry);
                if (!public_path) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(public_path, 0755);
                if (r < 0)
                        goto fail;

                if (!use_private_root) {
                        /* Simple case: the directory lives directly below the prefix */
                        r = mkdir_label(public_path, context->directories[type].mode);
                        if (r < 0 && r != -EEXIST)
                                goto fail;

                        target = public_path;

                } else {
                        _cleanup_free_ char *private_root = NULL, *link_target = NULL, *link_dir = NULL;

                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
                         * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
                         * whose UID is later on reused. To lock this down we use the same trick used by container
                         * managers to prohibit host users to get access to files of the same UID in containers: we
                         * place everything inside a directory that has an access mode of 0700 and is owned root:root,
                         * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
                         * to make this directory permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create a
                         * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
                         * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
                         * privileged host users can access "foo/" as usual, but unprivileged host users can't look
                         * into it. Inside of the namespace of the container "private/" is replaced by a more liberally
                         * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
                         * disabling the access boundary for the service and making sure it only gets access to the
                         * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
                         * owned by the service itself. */

                        private_root = strjoin(params->prefix[type], "/private");
                        if (!private_root) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Step 1: the private root itself, mode 0700 and owned by root:root */
                        r = mkdir_safe_label(private_root, 0700, 0, 0, false);
                        if (r < 0)
                                goto fail;

                        private_path = strjoin(private_root, "/", *entry);
                        if (!private_path) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Step 2: all intermediate directories between the private root and the leaf, mode 0755 */
                        r = mkdir_parents_label(private_path, 0755);
                        if (r < 0)
                                goto fail;

                        /* Step 3: the service's actual directory; an already existing one is fine */
                        r = mkdir_label(private_path, context->directories[type].mode);
                        if (r < 0 && r != -EEXIST)
                                goto fail;

                        /* Step 4: a relative symlink at the public location pointing into the private root */
                        link_dir = dirname_malloc(public_path);
                        if (!link_dir) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        r = path_make_relative(link_dir, private_path, &link_target);
                        if (r < 0)
                                goto fail;

                        r = symlink_idempotent(link_target, public_path);
                        if (r < 0)
                                goto fail;

                        target = private_path;
                }

                /* Lock down the access mode first, also for directories that existed already */
                if (chmod(target, context->directories[type].mode) < 0) {
                        r = -errno;
                        goto fail;
                }

                /* The owner of a configuration directory is left alone, as in the common case it is not written
                 * to by a service, and shall not be writable. Everything else gets its whole tree chowned. */
                if (type != EXEC_DIRECTORY_CONFIGURATION) {
                        r = path_chown_recursive(target, uid, gid);
                        if (r < 0)
                                goto fail;
                }
        }

        return 0;

fail:
        *exit_status = exit_status_table[type];
        return r;
}

static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        /* Applies a SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()). The label
         * configured in the context is used if there is one. Otherwise, and only if a default process label is
         * compiled in, the executable's SMACK_ATTR_EXEC label or that compiled-in default is applied.
         * Returns 0 on success, a negative error otherwise. */

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing attribute or lack of support is not an error, we use the default then */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? exec_label : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}

static int compile_bind_mounts(
                const ExecContext *context,
                const ExecParameters *params,
                BindMount **ret_bind_mounts,
                unsigned *ret_n_bind_mounts,
                char ***ret_empty_directories) {

        /* Builds one array holding copies of the context's configured bind mounts followed by one bind mount
         * (source == destination) for each special directory whose prefix is set. For dynamic users the
         * non-configuration directories are taken from below "<prefix>/private", and that private root is
         * added to the list of empty directories.
         *
         * Returns the number of entries (0 with all outputs cleared if there is nothing to mount), or a
         * negative error. On error the output parameters are left untouched. */

        _cleanup_strv_free_ char **empty_directories = NULL;
        BindMount *bind_mounts;
        unsigned total, filled = 0, idx;
        ExecDirectoryType t;
        int r;

        assert(context);
        assert(params);
        assert(ret_bind_mounts);
        assert(ret_n_bind_mounts);
        assert(ret_empty_directories);

        /* Pass 1: figure out how many entries we will need */
        total = context->n_bind_mounts;
        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
                if (params->prefix[t])
                        total += strv_length(context->directories[t].paths);

        if (total == 0) {
                *ret_bind_mounts = NULL;
                *ret_n_bind_mounts = 0;
                *ret_empty_directories = NULL;
                return 0;
        }

        bind_mounts = new(BindMount, total);
        if (!bind_mounts)
                return -ENOMEM;

        /* Pass 2: duplicate the explicitly configured bind mounts */
        for (idx = 0; idx < context->n_bind_mounts; idx++) {
                const BindMount *configured = context->bind_mounts + idx;
                char *src, *dst;

                src = strdup(configured->source);
                if (!src) {
                        r = -ENOMEM;
                        goto finish;
                }

                dst = strdup(configured->destination);
                if (!dst) {
                        free(src);
                        r = -ENOMEM;
                        goto finish;
                }

                bind_mounts[filled++] = (BindMount) {
                        .source = src,
                        .destination = dst,
                        .read_only = configured->read_only,
                        .recursive = configured->recursive,
                        .ignore_enoent = configured->ignore_enoent,
                };
        }

        /* Pass 3: add the special directories */
        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                bool below_private;
                char **suffix;

                if (!params->prefix[t] || strv_isempty(context->directories[t].paths))
                        continue;

                below_private = context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION;

                if (below_private) {
                        char *private_root;

                        /* So this is for a dynamic user, and we need to make sure the process can access its own
                         * directory. For that we overmount the usually inaccessible "private" subdirectory with a
                         * tmpfs that makes it accessible and is empty except for the submounts we do this for. */

                        private_root = strjoin(params->prefix[t], "/private");
                        if (!private_root) {
                                r = -ENOMEM;
                                goto finish;
                        }

                        if (strv_consume(&empty_directories, private_root) < 0) {
                                r = -ENOMEM;
                                goto finish;
                        }
                }

                STRV_FOREACH(suffix, context->directories[t].paths) {
                        char *src, *dst;

                        src = strjoin(params->prefix[t], below_private ? "/private/" : "/", *suffix);
                        if (!src) {
                                r = -ENOMEM;
                                goto finish;
                        }

                        /* The directory is mounted onto itself */
                        dst = strdup(src);
                        if (!dst) {
                                free(src);
                                r = -ENOMEM;
                                goto finish;
                        }

                        bind_mounts[filled++] = (BindMount) {
                                .source = src,
                                .destination = dst,
                                .read_only = false,
                                .recursive = true,
                                .ignore_enoent = false,
                        };
                }
        }

        assert(filled == total);

        /* Hand everything over to the caller */
        *ret_bind_mounts = bind_mounts;
        *ret_n_bind_mounts = total;
        *ret_empty_directories = empty_directories;

        empty_directories = NULL;

        return (int) total;

finish:
        /* Only the first 'filled' entries have been initialized so far */
        bind_mount_free_many(bind_mounts, filled);
        return r;
}

static int apply_mount_namespace(
                Unit *u,
                ExecCommand *command,
                const ExecContext *context,
                const ExecParameters *params,
                ExecRuntime *runtime) {

        /* Collects the namespace settings of the context (private tmp directories, root directory or image,
         * bind mounts, sandboxing options) and hands them to setup_namespace(). -EPERM and -EACCES from
         * setup_namespace() are logged at debug level and turned into success; any other result is passed on. */

        _cleanup_strv_free_ char **empty_directories = NULL;
        char *tmp = NULL, *var = NULL;
        const char *root_dir = NULL, *root_image = NULL;
        NamespaceInfo ns_info = {
                .ignore_protect_paths = false,
                .private_dev = context->private_devices,
                .protect_control_groups = context->protect_control_groups,
                .protect_kernel_tunables = context->protect_kernel_tunables,
                .protect_kernel_modules = context->protect_kernel_modules,
                .mount_apivfs = context->mount_apivfs,
        };
        BindMount *bind_mounts = NULL;
        unsigned n_bind_mounts = 0;
        bool needs_sandboxing;
        int r;

        assert(context);

        /* The runtime struct only contains the parent of the private /tmp,
         * which is non-accessible to world users. Inside of it there's a /tmp
         * that is sticky, and that's the one we want to use here. */

        if (context->private_tmp && runtime) {
                if (runtime->tmp_dir)
                        tmp = strjoina(runtime->tmp_dir, "/tmp");
                if (runtime->var_tmp_dir)
                        var = strjoina(runtime->var_tmp_dir, "/tmp");
        }

        /* A root image takes precedence over a root directory */
        if (params->flags & EXEC_APPLY_CHROOT) {
                if (context->root_image)
                        root_image = context->root_image;
                else
                        root_dir = context->root_directory;
        }

        r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
        if (r < 0)
                return r;

        /*
         * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
         * sandbox info, otherwise enforce it, don't ignore protected paths and
         * fail if we are unable to apply the sandbox inside the mount namespace.
         */
        ns_info.ignore_protect_paths = !context->dynamic_user && root_dir;

        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);

        r = setup_namespace(
                        root_dir,
                        root_image,
                        &ns_info,
                        context->read_write_paths,
                        needs_sandboxing ? context->read_only_paths : NULL,
                        needs_sandboxing ? context->inaccessible_paths : NULL,
                        empty_directories,
                        bind_mounts,
                        n_bind_mounts,
                        tmp,
                        var,
                        needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
                        needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
                        context->mount_flags,
                        DISSECT_IMAGE_DISCARD_ON_LOOP);

        /* The compiled list is ours, release it regardless of the outcome */
        bind_mount_free_many(bind_mounts, n_bind_mounts);

        /* If we couldn't set up the namespace this is probably due to a
         * missing capability. In this case, silently proceed. */
        if (r == -EPERM || r == -EACCES) {
                log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
                return 0;
        }

        return r;
}

static int apply_working_directory(
                const ExecContext *context,
                const ExecParameters *params,
                const char *home,
                const bool needs_mount_ns,
                int *exit_status) {

        /* Picks the working directory (the passed home directory, the configured one, or "/"), optionally
         * chroot()s into the configured root directory and finally chdir()s. Returns 0 on success. On failure
         * returns a negative errno-style value and sets *exit_status to EXIT_CHDIR or EXIT_CHROOT. */

        const char *target, *wd;

        assert(context);
        assert(exit_status);

        if (!context->working_directory_home)
                wd = context->working_directory ? context->working_directory : "/";
        else if (home)
                wd = home;
        else {
                /* The home directory was requested, but we have none */
                *exit_status = EXIT_CHDIR;
                return -ENXIO;
        }

        if (!(params->flags & EXEC_APPLY_CHROOT))
                /* No chroot is applied here, hence prefix the working directory with the root directory */
                target = prefix_roota(context->root_directory, wd);
        else {
                /* chroot() is done here only if no mount namespace is needed */
                if (!needs_mount_ns && context->root_directory && chroot(context->root_directory) < 0) {
                        *exit_status = EXIT_CHROOT;
                        return -errno;
                }

                target = wd;
        }

        /* A failing chdir() is tolerated if the context says a missing directory is OK */
        if (chdir(target) < 0 && !context->working_directory_missing_ok) {
                *exit_status = EXIT_CHDIR;
                return -errno;
        }

        return 0;
}

static int setup_keyring(
                Unit *u,
                const ExecContext *context,
                const ExecParameters *p,
                uid_t uid, gid_t gid) {

        /* Joins a new session keyring, stores the unit's invocation ID in it, chowns the keyring to uid/gid and,
         * in EXEC_KEYRING_SHARED mode, links the user keyring into it.
         *
         * Returns 0 on success, if EXEC_NEW_KEYRING is not set, if the keyring mode is EXEC_KEYRING_INHERIT, or
         * if joining the keyring fails with ENOSYS, EACCES, EPERM or EDQUOT (those are logged at debug level and
         * ignored). Other failures are logged and returned via log_unit_error_errno(). */

        key_serial_t keyring;
        int r;

        assert(u);
        assert(context);
        assert(p);

        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */

        if (!(p->flags & EXEC_NEW_KEYRING))
                return 0;

        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
                return 0;

        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
        if (keyring == -1) {
                if (errno == ENOSYS)
                        log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
                else if (IN_SET(errno, EACCES, EPERM))
                        log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
                else if (errno == EDQUOT)
                        log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
                else
                        return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");

                return 0;
        }

        /* Populate the keyring with the invocation ID by default. */
        if (!sd_id128_is_null(u->invocation_id)) {
                key_serial_t key;

                key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
                if (key == -1)
                        log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
                else {
                        /* Possessor and owner may view, read and search the key, nothing else */
                        if (keyctl(KEYCTL_SETPERM, key,
                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
                                return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
                }
        }

        /* And now, make the keyring owned by the service's user */
        if (uid_is_valid(uid) || gid_is_valid(gid))
                if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
                        return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");

        /* When requested link the user keyring into the session keyring. */
        if (context->keyring_mode == EXEC_KEYRING_SHARED) {
                uid_t saved_uid;
                gid_t saved_gid;

                /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
                 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
                 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/

                saved_uid = getuid();
                saved_gid = getgid();

                if (gid_is_valid(gid) && gid != saved_gid) {
                        if (setregid(gid, -1) < 0)
                                return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
                }

                if (uid_is_valid(uid) && uid != saved_uid) {
                        if (setreuid(uid, -1) < 0) {
                                /* Remember the error before rolling back, the rollback call may clobber errno */
                                r = -errno;

                                (void) setregid(saved_gid, -1);

                                return log_unit_error_errno(u, r, "Failed to change UID for user keyring: %m");
                        }
                }

                if (keyctl(KEYCTL_LINK,
                           KEY_SPEC_USER_KEYRING,
                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {

                        r = -errno;

                        (void) setreuid(saved_uid, -1);
                        (void) setregid(saved_gid, -1);

                        return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
                }

                if (uid_is_valid(uid) && uid != saved_uid) {
                        if (setreuid(saved_uid, -1) < 0) {
                                /* Same here: save errno before the best-effort GID restore can overwrite it */
                                r = -errno;

                                (void) setregid(saved_gid, -1);

                                return log_unit_error_errno(u, r, "Failed to change UID back for user keyring: %m");
                        }
                }

                if (gid_is_valid(gid) && gid != saved_gid) {
                        if (setregid(saved_gid, -1) < 0)
                                return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
                }
        }

        return 0;
}

/* Appends each non-negative fd of the pair to the array, advancing *n for every fd added. A NULL pair is
 * ignored. The caller must make sure the array has room for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned k;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (k = 0; k < 2; k++)
                if (pair[k] >= 0)
                        array[(*n)++] = pair[k];
}

static int close_remaining_fds(
                const ExecParameters *params,
                ExecRuntime *runtime,
                DynamicCreds *dcreds,
                int user_lookup_fd,
                int socket_fd,
                int *fds, unsigned n_fds) {

        /* Collects every fd that has to stay open and passes that list to close_all_fds(), whose result is
         * returned. Besides the n_fds passed fds at most 11 further entries are added (3 stdio fds, the socket
         * fd, three socket pairs, the user lookup fd), so 12 extra slots are sufficient. */

        int dont_close[n_fds + 12];
        unsigned n_dont_close = 0, k;

        assert(params);

        {
                const int stdio_fds[3] = { params->stdin_fd, params->stdout_fd, params->stderr_fd };

                for (k = 0; k < 3; k++)
                        if (stdio_fds[k] >= 0)
                                dont_close[n_dont_close++] = stdio_fds[k];
        }

        if (socket_fd >= 0)
                dont_close[n_dont_close++] = socket_fd;

        /* The fds passed to the service are taken over verbatim */
        for (k = 0; k < n_fds; k++)
                dont_close[n_dont_close++] = fds[k];

        if (runtime)
                append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);

        if (dcreds && dcreds->user)
                append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
        if (dcreds && dcreds->group)
                append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);

        if (user_lookup_fd >= 0)
                dont_close[n_dont_close++] = user_lookup_fd;

        return close_all_fds(dont_close, n_dont_close);
}

static int send_user_lookup(
                Unit *unit,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit);

        /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
         * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
         * specified. Returns 0 on success or if nothing was sent, -errno if writev() fails. */

        if (user_lookup_fd < 0 || (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        {
                /* Payload layout: UID, then GID, then the unit name */
                struct iovec payload[] = {
                        IOVEC_INIT(&uid, sizeof(uid)),
                        IOVEC_INIT(&gid, sizeof(gid)),
                        IOVEC_INIT_STRING(unit->id),
                };

                if (writev(user_lookup_fd, payload, 3) < 0)
                        return -errno;
        }

        return 0;
}

static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
        int r;

        assert(c);
        assert(home);
        assert(buf);

        /* If WorkingDirectory=~ is set, try to acquire a usable home directory.
         *
         * Returns 0 if *home is set already or no home directory is wanted, 1 if *home was filled in here, and
         * a negative error if get_home_dir() fails. For non-root UIDs the result of get_home_dir() is stored in
         * *buf and *home is pointed at it. */

        if (*home || !c->working_directory_home)
                return 0;

        if (uid != 0) {
                r = get_home_dir(buf);
                if (r < 0)
                        return r;

                *home = *buf;
        } else
                /* Hardcode /root as home directory for UID 0 */
                *home = "/root";

        return 1;
}

static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
        _cleanup_strv_free_ char ** list = NULL;
        ExecDirectoryType t;
        int r;

        assert(c);
        assert(p);
        assert(ret);

        assert(c->dynamic_user);

        /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
         * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
         * directories. */

        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                char **i;

                if (t == EXEC_DIRECTORY_CONFIGURATION)
                        continue;

                if (!p->prefix[t])
                        continue;

                STRV_FOREACH(i, c->directories[t].paths) {
                        char *e;

                        e = strjoin(p->prefix[t], "/private/", *i);
                        if (!e)
                                return -ENOMEM;

                        r = strv_consume(&list, e);
                        if (r < 0)
                                return r;
                }
        }

        *ret = list;
        list = NULL;

        return 0;
}

/* Performs all per-process setup for a command and finally execve()s command->path. Judging by what it does
 * (renaming the process, resetting signal dispositions, closing every fd not explicitly passed in) this is
 * presumably meant to run in the freshly forked child only — confirm against the caller.
 *
 * Parameters:
 *   unit            – used for logging and for the confirm-spawn check
 *   command         – supplies the binary path and the EXEC_COMMAND_* flags
 *   context         – the execution settings to apply (credentials, scheduling, sandboxing, environment, …)
 *   params          – per-invocation parameters (flags, cgroup path, idle pipe, confirm_spawn, environment, prefixes)
 *   runtime         – may be NULL; its netns storage socket is used if PrivateNetwork= is requested
 *   dcreds          – may be NULL; dynamic user credentials, only used if context->dynamic_user is set
 *   argv            – argument vector; environment variable references are expanded right before execve()
 *   socket_fd       – if >= 0, it is switched to blocking mode and handed to the stdio setup helpers
 *   named_iofds     – handed to the stdin/stdout/stderr setup helpers
 *   fds             – fds to keep open for the executed process, n_storage_fds + n_socket_fds entries
 *   files_env       – extra environment block, merged last into the final environment
 *   user_lookup_fd  – the resolved UID/GID are sent over this fd, which is closed afterwards
 *   exit_status     – out: EXIT_* code identifying the step that failed
 *
 * Returns: does not return if execve() succeeds. On failure a negative errno-style value is returned and
 * *exit_status names the failing step. 0 is returned in two cases: the user asked to pretend success in the
 * confirmation prompt (*exit_status is set to EXIT_SUCCESS), or the binary is missing (ENOENT) while the
 * command carries EXEC_COMMAND_IGNORE_FAILURE (*exit_status is left untouched in that case). */
static int exec_child(
                Unit *unit,
                ExecCommand *command,
                const ExecContext *context,
                const ExecParameters *params,
                ExecRuntime *runtime,
                DynamicCreds *dcreds,
                char **argv,
                int socket_fd,
                int named_iofds[3],
                int *fds,
                unsigned n_storage_fds,
                unsigned n_socket_fds,
                char **files_env,
                int user_lookup_fd,
                int *exit_status) {

        _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
        _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
        _cleanup_free_ gid_t *supplementary_gids = NULL;
        const char *username = NULL, *groupname = NULL;
        const char *home = NULL, *shell = NULL;
        dev_t journal_stream_dev = 0;
        ino_t journal_stream_ino = 0;
        bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
                needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
                needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
                needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
#if HAVE_SELINUX
        bool use_selinux = false;
#endif
#if ENABLE_SMACK
        bool use_smack = false;
#endif
#if HAVE_APPARMOR
        bool use_apparmor = false;
#endif
        uid_t uid = UID_INVALID;
        gid_t gid = GID_INVALID;
        int i, r, ngids = 0;
        unsigned n_fds;
        ExecDirectoryType dt;
        int secure_bits;

        assert(unit);
        assert(command);
        assert(context);
        assert(params);
        assert(exit_status);

        rename_process_from_path(command->path);

        /* We reset exactly these signals, since they are the
         * only ones we set to SIG_IGN in the main daemon. All
         * others we leave untouched because we set them to
         * SIG_DFL or a valid handler initially, both of which
         * will be demoted to SIG_DFL. */
        (void) default_signals(SIGNALS_CRASH_HANDLER,
                               SIGNALS_IGNORE, -1);

        if (context->ignore_sigpipe)
                (void) ignore_signals(SIGPIPE, -1);

        r = reset_signal_mask();
        if (r < 0) {
                *exit_status = EXIT_SIGNAL_MASK;
                return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
        }

        if (params->idle_pipe)
                do_idle_pipe_dance(params->idle_pipe);

        /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
         * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
         * any fds open we don't really want open during the transition. In order to make logging work, we switch the
         * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */

        log_forget_fds();
        log_set_open_when_needed(true);

        /* In case anything used libc syslog(), close this here, too */
        closelog();

        n_fds = n_storage_fds + n_socket_fds;
        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
        if (r < 0) {
                *exit_status = EXIT_FDS;
                return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
        }

        if (!context->same_pgrp)
                if (setsid() < 0) {
                        *exit_status = EXIT_SETSID;
                        return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
                }

        exec_context_tty_reset(context, params);

        /* Interactive confirmation: anything but CONFIRM_EXECUTE stops us here, either pretending success or
         * reporting the cancellation. */
        if (unit_shall_confirm_spawn(unit)) {
                const char *vc = params->confirm_spawn;
                _cleanup_free_ char *cmdline = NULL;

                cmdline = exec_command_line(argv);
                if (!cmdline) {
                        *exit_status = EXIT_MEMORY;
                        return log_oom();
                }

                r = ask_for_confirmation(vc, unit, cmdline);
                if (r != CONFIRM_EXECUTE) {
                        if (r == CONFIRM_PRETEND_SUCCESS) {
                                *exit_status = EXIT_SUCCESS;
                                return 0;
                        }
                        *exit_status = EXIT_CONFIRM;
                        log_unit_error(unit, "Execution cancelled by the user");
                        return -ECANCELED;
                }
        }

        /* Resolve the credentials to run as: either dynamically allocated ones, or the fixed user/group from
         * the context. */
        if (context->dynamic_user && dcreds) {
                _cleanup_strv_free_ char **suggested_paths = NULL;

                /* Make sure we bypass our own NSS module for any NSS checks */
                if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
                        *exit_status = EXIT_USER;
                        return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
                }

                r = compile_suggested_paths(context, params, &suggested_paths);
                if (r < 0) {
                        *exit_status = EXIT_MEMORY;
                        return log_oom();
                }

                r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
                if (r < 0) {
                        *exit_status = EXIT_USER;
                        if (r == -EILSEQ) {
                                log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
                                return -EOPNOTSUPP;
                        }
                        return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
                }

                if (!uid_is_valid(uid)) {
                        *exit_status = EXIT_USER;
                        log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
                        return -ESRCH;
                }

                if (!gid_is_valid(gid)) {
                        *exit_status = EXIT_USER;
                        log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
                        return -ESRCH;
                }

                if (dcreds->user)
                        username = dcreds->user->name;

        } else {
                r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
                if (r < 0) {
                        *exit_status = EXIT_USER;
                        return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
                }

                r = get_fixed_group(context, &groupname, &gid);
                if (r < 0) {
                        *exit_status = EXIT_GROUP;
                        return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
                }
        }

        /* Initialize user supplementary groups and get SupplementaryGroups= ones */
        r = get_supplementary_groups(context, username, groupname, gid,
                                     &supplementary_gids, &ngids);
        if (r < 0) {
                *exit_status = EXIT_GROUP;
                return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
        }

        /* Report the resolved UID/GID back, then drop the fd: it is not needed beyond this point. */
        r = send_user_lookup(unit, user_lookup_fd, uid, gid);
        if (r < 0) {
                *exit_status = EXIT_USER;
                return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
        }

        user_lookup_fd = safe_close(user_lookup_fd);

        r = acquire_home(context, uid, &home, &home_buffer);
        if (r < 0) {
                *exit_status = EXIT_CHDIR;
                return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
        }

        /* If a socket is connected to STDIN/STDOUT/STDERR, we
         * must make sure to drop O_NONBLOCK */
        if (socket_fd >= 0)
                (void) fd_nonblock(socket_fd, false);

        r = setup_input(context, params, socket_fd, named_iofds);
        if (r < 0) {
                *exit_status = EXIT_STDIN;
                return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
        }

        r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
        if (r < 0) {
                *exit_status = EXIT_STDOUT;
                return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
        }

        r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
        if (r < 0) {
                *exit_status = EXIT_STDERR;
                return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
        }

        if (params->cgroup_path) {
                r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
                if (r < 0) {
                        *exit_status = EXIT_CGROUP;
                        return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
                }
        }

        if (context->oom_score_adjust_set) {
                char t[DECIMAL_STR_MAX(context->oom_score_adjust)];

                /* When we can't make this change due to EPERM, then
                 * let's silently skip over it. User namespaces
                 * prohibit write access to this file, and we
                 * shouldn't trip up over that. */

                sprintf(t, "%i", context->oom_score_adjust);
                r = write_string_file("/proc/self/oom_score_adj", t, 0);
                if (IN_SET(r, -EPERM, -EACCES))
                        log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
                else if (r < 0) {
                        *exit_status = EXIT_OOM_ADJUST;
                        return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
                }
        }

        if (context->nice_set)
                if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
                        *exit_status = EXIT_NICE;
                        return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
                }

        if (context->cpu_sched_set) {
                struct sched_param param = {
                        .sched_priority = context->cpu_sched_priority,
                };

                r = sched_setscheduler(0,
                                       context->cpu_sched_policy |
                                       (context->cpu_sched_reset_on_fork ?
                                        SCHED_RESET_ON_FORK : 0),
                                       &param);
                if (r < 0) {
                        *exit_status = EXIT_SETSCHEDULER;
                        return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
                }
        }

        if (context->cpuset)
                if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
                        *exit_status = EXIT_CPUAFFINITY;
                        return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
                }

        if (context->ioprio_set)
                if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
                        *exit_status = EXIT_IOPRIO;
                        return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
                }

        if (context->timer_slack_nsec != NSEC_INFINITY)
                if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
                        *exit_status = EXIT_TIMERSLACK;
                        return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
                }

        if (context->personality != PERSONALITY_INVALID) {
                r = safe_personality(context->personality);
                if (r < 0) {
                        *exit_status = EXIT_PERSONALITY;
                        return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
                }
        }

        if (context->utmp_id)
                utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
                                      context->tty_path,
                                      context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
                                      context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
                                      USER_PROCESS,
                                      username);

        if (context->user) {
                r = chown_terminal(STDIN_FILENO, uid);
                if (r < 0) {
                        *exit_status = EXIT_STDIN;
                        return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
                }
        }

        /* If delegation is enabled we'll pass ownership of the cgroup
         * (but only in systemd's own controller hierarchy!) to the
         * user of the new process. */
        if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
                r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
                if (r < 0) {
                        *exit_status = EXIT_CGROUP;
                        return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
                }

                r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
                if (r < 0) {
                        *exit_status = EXIT_CGROUP;
                        return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
                }
        }

        /* Note that exit_status is handed to the helper here, instead of being set by us. */
        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
                r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
                if (r < 0)
                        return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
        }

        r = build_environment(
                        unit,
                        context,
                        params,
                        n_fds,
                        home,
                        username,
                        shell,
                        journal_stream_dev,
                        journal_stream_ino,
                        &our_env);
        if (r < 0) {
                *exit_status = EXIT_MEMORY;
                return log_oom();
        }

        r = build_pass_environment(context, &pass_env);
        if (r < 0) {
                *exit_status = EXIT_MEMORY;
                return log_oom();
        }

        /* Combine the five environment blocks into the one the executed process will see. */
        accum_env = strv_env_merge(5,
                                   params->environment,
                                   our_env,
                                   pass_env,
                                   context->environment,
                                   files_env,
                                   NULL);
        if (!accum_env) {
                *exit_status = EXIT_MEMORY;
                return log_oom();
        }
        accum_env = strv_env_clean(accum_env);

        (void) umask(context->umask);

        r = setup_keyring(unit, context, params, uid, gid);
        if (r < 0) {
                *exit_status = EXIT_KEYRING;
                return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
        }

        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);

        /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
        needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();

        /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
        if (needs_ambient_hack)
                needs_setuid = false;
        else
                needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));

        if (needs_sandboxing) {
                /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
                 * present. The actual MAC context application will happen later, as late as possible, to avoid
                 * impacting our own code paths. */

#if HAVE_SELINUX
                use_selinux = mac_selinux_use();
#endif
#if ENABLE_SMACK
                use_smack = mac_smack_use();
#endif
#if HAVE_APPARMOR
                use_apparmor = mac_apparmor_use();
#endif
        }

        if (needs_setuid) {
                if (context->pam_name && username) {
                        r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
                        if (r < 0) {
                                *exit_status = EXIT_PAM;
                                return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
                        }
                }
        }

        if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
                if (ns_type_supported(NAMESPACE_NET)) {
                        r = setup_netns(runtime->netns_storage_socket);
                        if (r < 0) {
                                *exit_status = EXIT_NETWORK;
                                return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
                        }
                } else
                        log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
        }

        needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
        if (needs_mount_namespace) {
                r = apply_mount_namespace(unit, command, context, params, runtime);
                if (r < 0) {
                        *exit_status = EXIT_NAMESPACE;
                        return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
                }
        }

        /* Apply just after mount namespace setup */
        r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");

        /* Drop groups as early as possible */
        if (needs_setuid) {
                r = enforce_groups(gid, supplementary_gids, ngids);
                if (r < 0) {
                        *exit_status = EXIT_GROUP;
                        return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
                }
        }

        if (needs_sandboxing) {
#if HAVE_SELINUX
                if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
                        r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
                        if (r < 0) {
                                *exit_status = EXIT_SELINUX_CONTEXT;
                                return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
                        }
                }
#endif

                if (context->private_users) {
                        r = setup_private_users(uid, gid);
                        if (r < 0) {
                                *exit_status = EXIT_USER;
                                return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
                        }
                }
        }

        /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
         * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
         * was needed to upload the policy and can now be closed as well. */
        r = close_all_fds(fds, n_fds);
        if (r >= 0)
                r = shift_fds(fds, n_fds);
        if (r >= 0)
                r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
        if (r < 0) {
                *exit_status = EXIT_FDS;
                return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
        }

        /* Work on a local copy: SECURE_KEEP_CAPS may get added below, and the context itself is const. */
        secure_bits = context->secure_bits;

        if (needs_sandboxing) {
                uint64_t bset;

                for (i = 0; i < _RLIMIT_MAX; i++) {

                        if (!context->rlimit[i])
                                continue;

                        r = setrlimit_closest(i, context->rlimit[i]);
                        if (r < 0) {
                                *exit_status = EXIT_LIMITS;
                                return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
                        }
                }

                /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
                if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
                        if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
                                *exit_status = EXIT_LIMITS;
                                return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
                        }
                }

                bset = context->capability_bounding_set;
                /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
                 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
                 * instead of us doing that */
                if (needs_ambient_hack)
                        bset |= (UINT64_C(1) << CAP_SETPCAP) |
                                (UINT64_C(1) << CAP_SETUID) |
                                (UINT64_C(1) << CAP_SETGID);

                if (!cap_test_all(bset)) {
                        r = capability_bounding_set_drop(bset, false);
                        if (r < 0) {
                                *exit_status = EXIT_CAPABILITIES;
                                return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
                        }
                }

                /* This is done before enforce_user, but ambient set
                 * does not survive over setresuid() if keep_caps is not set. */
                if (!needs_ambient_hack &&
                    context->capability_ambient_set != 0) {
                        r = capability_ambient_set_apply(context->capability_ambient_set, true);
                        if (r < 0) {
                                *exit_status = EXIT_CAPABILITIES;
                                return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
                        }
                }
        }

        if (needs_setuid) {
                if (context->user) {
                        r = enforce_user(context, uid);
                        if (r < 0) {
                                *exit_status = EXIT_USER;
                                return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
                        }

                        if (!needs_ambient_hack &&
                            context->capability_ambient_set != 0) {

                                /* Fix the ambient capabilities after user change. */
                                r = capability_ambient_set_apply(context->capability_ambient_set, false);
                                if (r < 0) {
                                        *exit_status = EXIT_CAPABILITIES;
                                        return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
                                }

                                /* If we were asked to change user and ambient capabilities
                                 * were requested, we had to add keep-caps to the securebits
                                 * so that we would maintain the inherited capability set
                                 * through the setresuid(). Make sure that the bit is added
                                 * also to the context secure_bits so that we don't try to
                                 * drop the bit away next. */

                                secure_bits |= 1<<SECURE_KEEP_CAPS;
                        }
                }
        }

        if (needs_sandboxing) {
                /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
                 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
                 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
                 * are restricted. */

#if HAVE_SELINUX
                if (use_selinux) {
                        char *exec_context = mac_selinux_context_net ?: context->selinux_context;

                        if (exec_context) {
                                /* setexeccon() follows the -1/errno convention (just like aa_change_onexec()
                                 * below), hence log errno and not the raw return value, which would always be
                                 * reported as EPERM. */
                                if (setexeccon(exec_context) < 0) {
                                        *exit_status = EXIT_SELINUX_CONTEXT;
                                        return log_unit_error_errno(unit, errno, "Failed to change SELinux context to %s: %m", exec_context);
                                }
                        }
                }
#endif

#if ENABLE_SMACK
                if (use_smack) {
                        r = setup_smack(context, command);
                        if (r < 0) {
                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
                                return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
                        }
                }
#endif

#if HAVE_APPARMOR
                if (use_apparmor && context->apparmor_profile) {
                        r = aa_change_onexec(context->apparmor_profile);
                        if (r < 0 && !context->apparmor_profile_ignore) {
                                *exit_status = EXIT_APPARMOR_PROFILE;
                                return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
                        }
                }
#endif

                /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
                 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
                if (prctl(PR_GET_SECUREBITS) != secure_bits)
                        if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
                                *exit_status = EXIT_SECUREBITS;
                                return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
                        }

                if (context_has_no_new_privileges(context))
                        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                *exit_status = EXIT_NO_NEW_PRIVILEGES;
                                return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
                        }

#if HAVE_SECCOMP
                r = apply_address_families(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_ADDRESS_FAMILIES;
                        return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
                }

                r = apply_memory_deny_write_execute(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
                }

                r = apply_restrict_realtime(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
                }

                r = apply_restrict_namespaces(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
                }

                r = apply_protect_sysctl(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
                }

                r = apply_protect_kernel_modules(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
                }

                r = apply_private_devices(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
                }

                r = apply_syscall_archs(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
                }

                r = apply_lock_personality(unit, context);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
                }

                /* This really should remain the last step before the execve(), to make sure our own code is unaffected
                 * by the filter as little as possible. */
                r = apply_syscall_filter(unit, context, needs_ambient_hack);
                if (r < 0) {
                        *exit_status = EXIT_SECCOMP;
                        return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
                }
#endif
        }

        /* Drop the variables listed in unset_environment from the final environment block. */
        if (!strv_isempty(context->unset_environment)) {
                char **ee = NULL;

                ee = strv_env_delete(accum_env, 1, context->unset_environment);
                if (!ee) {
                        *exit_status = EXIT_MEMORY;
                        return log_oom();
                }

                strv_free(accum_env);
                accum_env = ee;
        }

        final_argv = replace_env_argv(argv, accum_env);
        if (!final_argv) {
                *exit_status = EXIT_MEMORY;
                return log_oom();
        }

        if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
                /* Initialized explicitly, so that the cleanup handler never sees an indeterminate pointer. */
                _cleanup_free_ char *line = NULL;

                line = exec_command_line(final_argv);
                if (line) {
                        log_struct(LOG_DEBUG,
                                   "EXECUTABLE=%s", command->path,
                                   LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
                                   LOG_UNIT_ID(unit),
                                   LOG_UNIT_INVOCATION_ID(unit),
                                   NULL);
                }
        }

        execve(command->path, final_argv, accum_env);

        /* We only get here if execve() failed. A missing binary is not treated as an error if the command is
         * flagged to ignore failure; *exit_status is deliberately left as the caller initialized it. */
        if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {

                log_struct_errno(LOG_INFO, errno,
                                 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
                                 LOG_UNIT_ID(unit),
                                 LOG_UNIT_INVOCATION_ID(unit),
                                 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
                                                  command->path),
                                 "EXECUTABLE=%s", command->path,
                                 NULL);

                return 0;
        }

        *exit_status = EXIT_EXEC;
        return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
}

/* Forks off a child process that runs exec_child() for the specified command, and registers the resulting PID
 * in the parent.
 *
 * On success 0 is returned and the PID of the forked child is stored in *ret. On failure a negative
 * errno-style value is returned; every failure path in here logs before returning. */
int exec_spawn(Unit *unit,
               ExecCommand *command,
               const ExecContext *context,
               const ExecParameters *params,
               ExecRuntime *runtime,
               DynamicCreds *dcreds,
               pid_t *ret) {

        _cleanup_strv_free_ char **files_env = NULL;
        _cleanup_free_ char *line = NULL;
        int named_iofds[3] = { -1, -1, -1 };
        unsigned n_storage_fds = 0, n_socket_fds = 0;
        int *fds = NULL, socket_fd = -1, r;
        bool stdio_on_socket;
        char **argv;
        pid_t pid;

        assert(unit);
        assert(command);
        assert(context);
        assert(ret);
        assert(params);
        assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));

        stdio_on_socket =
                context->std_input == EXEC_INPUT_SOCKET ||
                context->std_output == EXEC_OUTPUT_SOCKET ||
                context->std_error == EXEC_OUTPUT_SOCKET;

        if (stdio_on_socket) {
                /* stdio shall be connected to a socket: we need exactly one socket fd, and no fd array is
                 * handed to the child in this case. */

                if (params->n_socket_fds == 0) {
                        log_unit_error(unit, "Got no socket.");
                        return -EINVAL;
                }

                if (params->n_socket_fds > 1) {
                        log_unit_error(unit, "Got more than one socket.");
                        return -EINVAL;
                }

                socket_fd = params->fds[0];
        } else {
                /* Otherwise, pass the whole set of file descriptors on. */
                fds = params->fds;
                n_storage_fds = params->n_storage_fds;
                n_socket_fds = params->n_socket_fds;
        }

        r = exec_context_named_iofds(unit, context, params, named_iofds);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");

        r = exec_context_load_environment(unit, context, &files_env);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load environment files: %m");

        /* An argument vector supplied via the parameters takes precedence over the one of the command */
        argv = params->argv ? params->argv : command->argv;

        line = exec_command_line(argv);
        if (!line)
                return log_oom();

        log_struct(LOG_DEBUG,
                   LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
                   "EXECUTABLE=%s", command->path,
                   LOG_UNIT_ID(unit),
                   LOG_UNIT_INVOCATION_ID(unit),
                   NULL);

        pid = fork();
        if (pid < 0)
                return log_unit_error_errno(unit, errno, "Failed to fork: %m");

        if (pid == 0) {
                /* Child. If exec_child() returns, log any error it reported and terminate with the exit
                 * status it filled in. */
                int exit_status = EXIT_SUCCESS;

                r = exec_child(unit,
                               command,
                               context,
                               params,
                               runtime,
                               dcreds,
                               argv,
                               socket_fd,
                               named_iofds,
                               fds,
                               n_storage_fds,
                               n_socket_fds,
                               files_env,
                               unit->manager->user_lookup_fds[1],
                               &exit_status);

                if (r < 0) {
                        log_struct_errno(LOG_ERR, r,
                                         "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
                                         LOG_UNIT_ID(unit),
                                         LOG_UNIT_INVOCATION_ID(unit),
                                         LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
                                                          exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
                                                          command->path),
                                         "EXECUTABLE=%s", command->path,
                                         NULL);
                }

                _exit(exit_status);
        }

        /* Parent from here on. */
        log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);

        /* The new process is added to the cgroup both in the child (so that no user code is ever
         * executed outside of the cgroup) and here in the parent (so that killing the cgroup is
         * certain to kill the process too). Failure to attach here is deliberately ignored. */
        if (params->cgroup_path)
                (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);

        exec_status_start(&command->exec_status, pid);

        *ret = pid;
        return 0;
}

/* Fills in the non-zero default settings of an execution context. Members not mentioned here are not
 * touched. */
void exec_context_init(ExecContext *c) {
        ExecDirectoryType dt;

        assert(c);

        /* File creation and directory modes */
        c->umask = 0022;
        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
                c->directories[dt].mode = 0755;

        /* Scheduling */
        c->cpu_sched_policy = SCHED_OTHER;
        c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
        c->timer_slack_nsec = NSEC_INFINITY;

        /* Logging */
        c->syslog_priority = LOG_DAEMON|LOG_INFO;
        c->syslog_level_prefix = true;

        /* Everything else */
        c->ignore_sigpipe = true;
        c->personality = PERSONALITY_INVALID;
        c->capability_bounding_set = CAP_ALL;
        c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
}

/* Releases all dynamically allocated members of the execution context and resets them, so that no
 * dangling pointers remain and calling this a second time is harmless. The ExecContext structure itself
 * is not freed. */
void exec_context_done(ExecContext *c) {
        unsigned l;
        ExecDirectoryType i;

        assert(c);

        c->environment = strv_free(c->environment);
        c->environment_files = strv_free(c->environment_files);
        c->pass_environment = strv_free(c->pass_environment);
        c->unset_environment = strv_free(c->unset_environment);

        for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
                c->rlimit[l] = mfree(c->rlimit[l]);

        for (l = 0; l < 3; l++)
                c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);

        c->working_directory = mfree(c->working_directory);
        c->root_directory = mfree(c->root_directory);
        c->root_image = mfree(c->root_image);
        c->tty_path = mfree(c->tty_path);
        c->syslog_identifier = mfree(c->syslog_identifier);
        c->user = mfree(c->user);
        c->group = mfree(c->group);

        c->supplementary_groups = strv_free(c->supplementary_groups);

        c->pam_name = mfree(c->pam_name);

        c->read_only_paths = strv_free(c->read_only_paths);
        c->read_write_paths = strv_free(c->read_write_paths);
        c->inaccessible_paths = strv_free(c->inaccessible_paths);

        /* Unlike the helpers above, bind_mount_free_many() result is not assigned back, hence reset the
         * array and its size explicitly, like it is done for all other members. */
        bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
        c->bind_mounts = NULL;
        c->n_bind_mounts = 0;

        /* Same for the CPU set: drop the stale pointer after releasing it. */
        if (c->cpuset) {
                CPU_FREE(c->cpuset);
                c->cpuset = NULL;
                c->cpuset_ncpus = 0;
        }

        c->utmp_id = mfree(c->utmp_id);
        c->selinux_context = mfree(c->selinux_context);
        c->apparmor_profile = mfree(c->apparmor_profile);
        c->smack_process_label = mfree(c->smack_process_label);

        c->syscall_filter = set_free(c->syscall_filter);
        c->syscall_archs = set_free(c->syscall_archs);
        c->address_families = set_free(c->address_families);

        for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
                c->directories[i].paths = strv_free(c->directories[i].paths);
}

/* Removes the runtime directories configured for the context, both directly below runtime_prefix and below
 * its "private" subdirectory. Does nothing if runtime_prefix is NULL. Returns 0 on success and -ENOMEM if a
 * path could not be allocated; failures of the removal itself are ignored. */
int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
        char **name;

        assert(c);

        if (!runtime_prefix)
                return 0;

        STRV_FOREACH(name, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
                _cleanup_free_ char *regular = NULL, *below_private = NULL;

                regular = strjoin(runtime_prefix, "/", *name);
                if (!regular)
                        return -ENOMEM;

                /* This is done synchronously, since the directory must be gone by the time the service is
                 * started the next time. */
                (void) rm_rf(regular, REMOVE_ROOT);

                /* Also remove a subdirectory of the same name below /private/, which is used for
                 * DynamicUser=1 setups. This is intentionally not made conditional on that setting: it makes
                 * us more robust against changed unit settings, and in the worst case it is a NOP. */
                below_private = strjoin(runtime_prefix, "/private/", *name);
                if (!below_private)
                        return -ENOMEM;

                (void) rm_rf(below_private, REMOVE_ROOT);
        }

        return 0;
}

/* Releases the path and the argument vector of a command and resets both members. The ExecCommand
 * structure itself is not freed. */
void exec_command_done(ExecCommand *c) {
        assert(c);

        c->argv = strv_free(c->argv);
        c->path = mfree(c->path);
}

/* Calls exec_command_done() on each of the first n entries of the array c, in order. */
void exec_command_done_array(ExecCommand *c, unsigned n) {
        unsigned idx = 0;

        while (idx < n) {
                exec_command_done(&c[idx]);
                idx++;
        }
}

/* Frees every command of the list starting at c, including the ExecCommand structures themselves.
 * Always returns NULL, so that the result can be assigned back to the list head. */
ExecCommand* exec_command_free_list(ExecCommand *c) {

        for (;;) {
                ExecCommand *head = c;

                if (!head)
                        break;

                /* Unlinking updates c to the new head of the list */
                LIST_REMOVE(command, c, head);
                exec_command_done(head);
                free(head);
        }

        return NULL;
}

/* Frees each of the n command lists stored in the array c and stores the value returned by
 * exec_command_free_list() back into the respective slot. */
void exec_command_free_array(ExecCommand **c, unsigned n) {
        unsigned idx;

        for (idx = 0; idx < n; idx++) {
                ExecCommand *head = c[idx];

                c[idx] = exec_command_free_list(head);
        }
}

/* Context handed to invalid_env() through its userdata pointer. */
typedef struct InvalidEnvInfo {
        /* Unit the log message is generated for */
        Unit *unit;
        /* Printed after the offending assignment; exec_context_load_environment() sets this to the name of
         * the environment file that is being loaded */
        const char *path;
} InvalidEnvInfo;

/* Callback for strv_env_clean_with_callback(): logs the assignment p that is being ignored. userdata
 * must point to an InvalidEnvInfo. */
static void invalid_env(const char *p, void *userdata) {
        const InvalidEnvInfo *ctx = userdata;

        log_unit_error(ctx->unit,
                       "Ignoring invalid environment assignment '%s': %s",
                       p,
                       ctx->path);
}

/* Returns the file descriptor name to look for on the stdio stream fd_index (STDIN_FILENO, STDOUT_FILENO or
 * STDERR_FILENO), if that stream is configured to use a named file descriptor. If no explicit name is set
 * "stdin", "stdout" or "stderr" is returned, respectively. Returns NULL if the stream does not use a named
 * file descriptor, or if fd_index is none of the three. */
const char* exec_context_fdname(const ExecContext *c, int fd_index) {
        assert(c);

        if (fd_index == STDIN_FILENO) {
                if (c->std_input == EXEC_INPUT_NAMED_FD)
                        return c->stdio_fdname[STDIN_FILENO] ?: "stdin";

                return NULL;
        }

        if (fd_index == STDOUT_FILENO) {
                if (c->std_output == EXEC_OUTPUT_NAMED_FD)
                        return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";

                return NULL;
        }

        if (fd_index == STDERR_FILENO) {
                if (c->std_error == EXEC_OUTPUT_NAMED_FD)
                        return c->stdio_fdname[STDERR_FILENO] ?: "stderr";

                return NULL;
        }

        return NULL;
}

/* Resolves the file descriptors to use for stdin, stdout and stderr for those streams that are configured
 * as named file descriptors, by comparing the names returned by exec_context_fdname() with p->fd_names[].
 *
 * named_iofds[] is indexed by STDIN_FILENO, STDOUT_FILENO and STDERR_FILENO; entries that are >= 0 are not
 * overwritten. Each passed file descriptor is assigned to at most one of the three slots, stdin being
 * checked first. Returns 0 if all requested streams were resolved, and -ENOENT otherwise. */
int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
        const char *wanted_name[3];
        bool wanted[3];
        unsigned n_fds, missing = 0, i, slot;

        assert(c);
        assert(p);

        wanted[STDIN_FILENO] = c->std_input == EXEC_INPUT_NAMED_FD;
        wanted[STDOUT_FILENO] = c->std_output == EXEC_OUTPUT_NAMED_FD;
        wanted[STDERR_FILENO] = c->std_error == EXEC_OUTPUT_NAMED_FD;

        for (slot = 0; slot < 3; slot++) {
                if (wanted[slot])
                        missing++;

                wanted_name[slot] = exec_context_fdname(c, slot);
        }

        n_fds = p->n_storage_fds + p->n_socket_fds;

        for (i = 0; i < n_fds && missing > 0; i++)
                for (slot = 0; slot < 3; slot++) {

                        if (named_iofds[slot] >= 0)
                                continue; /* already resolved */

                        if (!wanted[slot] || !wanted_name[slot])
                                continue; /* not a named fd */

                        if (!streq(p->fd_names[i], wanted_name[slot]))
                                continue;

                        named_iofds[slot] = p->fds[i];
                        missing--;

                        /* One file descriptor fills one slot only */
                        break;
                }

        if (missing > 0)
                return -ENOENT;

        return 0;
}

/* Loads all environment files configured in c->environment_files and merges their contents into a single
 * string vector that is returned in *l (NULL if nothing was loaded).
 *
 * Each entry is a glob pattern that must be an absolute path. Entries prefixed with "-" are optional: any
 * failure regarding them is silently skipped. Invalid assignments in the files are dropped and logged via
 * invalid_env(). Returns 0 on success, a negative errno-style value on failure, in which case *l is left
 * untouched. */
int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
        _cleanup_strv_free_ char **env = NULL;
        char **entry;

        assert(c);
        assert(l);

        STRV_FOREACH(entry, c->environment_files) {
                _cleanup_globfree_ glob_t matches = {};
                const char *pattern = *entry;
                bool optional;
                unsigned idx;
                int k;

                optional = pattern[0] == '-';
                if (optional)
                        pattern++;

                if (!path_is_absolute(pattern)) {
                        if (optional)
                                continue;

                        return -EINVAL;
                }

                /* The file name may be a glob, in which case all matching files are loaded */
                k = safe_glob(pattern, 0, &matches);
                if (k < 0) {
                        if (optional)
                                continue;

                        return k;
                }

                /* If nothing matched -ENOENT should have been returned above */
                assert(matches.gl_pathc > 0);

                for (idx = 0; idx < matches.gl_pathc; idx++) {
                        char **loaded, **merged;

                        k = load_env_file(NULL, matches.gl_pathv[idx], NULL, &loaded);
                        if (k < 0) {
                                if (optional)
                                        continue;

                                return k;
                        }

                        /* Drop invalid assignments, and mention the file name when logging about them */
                        if (loaded) {
                                InvalidEnvInfo info = {
                                        .unit = unit,
                                        .path = matches.gl_pathv[idx]
                                };

                                loaded = strv_env_clean_with_callback(loaded, invalid_env, &info);
                        }

                        if (!env) {
                                /* First set we got, take it as is */
                                env = loaded;
                                continue;
                        }

                        merged = strv_env_merge(2, env, loaded);
                        strv_free(env);
                        strv_free(loaded);

                        /* Both inputs are gone now, hence unconditionally replace the accumulator */
                        env = merged;
                        if (!merged)
                                return -ENOMEM;
                }
        }

        /* Pass ownership to the caller */
        *l = env;
        env = NULL;

        return 0;
}

/* Checks whether the specified TTY might be the same device as /dev/console. Errs on the side of
 * returning true: that is the answer for a NULL tty and when the console cannot be resolved. */
static bool tty_may_match_dev_console(const char *tty) {
        _cleanup_free_ char *active = NULL;
        const char *name;
        char *console;

        if (!tty)
                return true;

        name = skip_dev_prefix(tty);

        /* The console itself trivially matches */
        if (streq(name, "console"))
                return true;

        console = resolve_dev_console(&active);
        if (!console)
                /* Could not resolve it, hence assume it may match */
                return true;

        if (streq(console, name))
                return true;

        /* "tty0" refers to the active VC, which may be any VC at times */
        return streq(console, "tty0") && tty_is_vc(name);
}

/* Returns true if the context uses a TTY in some way (resetting, hanging up or disallocating it, or
 * connecting stdio to it) and that TTY may be /dev/console. */
bool exec_context_may_touch_console(ExecContext *ec) {

        if (!ec->tty_reset &&
            !ec->tty_vhangup &&
            !ec->tty_vt_disallocate &&
            !is_terminal_input(ec->std_input) &&
            !is_terminal_output(ec->std_output) &&
            !is_terminal_output(ec->std_error))
                return false; /* No TTY involved at all */

        return tty_may_match_dev_console(exec_context_tty_path(ec));
}

static void strv_fprintf(FILE *f, char **l) {
        char **g;

        assert(f);

        STRV_FOREACH(g, l)
                fprintf(f, " %s", *g);
}

/* Writes a human-readable description of the execution context to the stream f, one setting per line, each
 * line starting with prefix (which is passed through strempty() first, so NULL is permitted). Most optional
 * settings are only shown if they are configured. Each setting is shown once. */
void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
        char **e, **d;
        unsigned i;
        ExecDirectoryType dt;
        int r;

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        /* Settings that are always shown */
        fprintf(f,
                "%sUMask: %04o\n"
                "%sWorkingDirectory: %s\n"
                "%sRootDirectory: %s\n"
                "%sNonBlocking: %s\n"
                "%sPrivateTmp: %s\n"
                "%sPrivateDevices: %s\n"
                "%sProtectKernelTunables: %s\n"
                "%sProtectKernelModules: %s\n"
                "%sProtectControlGroups: %s\n"
                "%sPrivateNetwork: %s\n"
                "%sPrivateUsers: %s\n"
                "%sProtectHome: %s\n"
                "%sProtectSystem: %s\n"
                "%sMountAPIVFS: %s\n"
                "%sIgnoreSIGPIPE: %s\n"
                "%sMemoryDenyWriteExecute: %s\n"
                "%sRestrictRealtime: %s\n"
                "%sKeyringMode: %s\n",
                prefix, c->umask,
                prefix, c->working_directory ? c->working_directory : "/",
                prefix, c->root_directory ? c->root_directory : "/",
                prefix, yes_no(c->non_blocking),
                prefix, yes_no(c->private_tmp),
                prefix, yes_no(c->private_devices),
                prefix, yes_no(c->protect_kernel_tunables),
                prefix, yes_no(c->protect_kernel_modules),
                prefix, yes_no(c->protect_control_groups),
                prefix, yes_no(c->private_network),
                prefix, yes_no(c->private_users),
                prefix, protect_home_to_string(c->protect_home),
                prefix, protect_system_to_string(c->protect_system),
                prefix, yes_no(c->mount_apivfs),
                prefix, yes_no(c->ignore_sigpipe),
                prefix, yes_no(c->memory_deny_write_execute),
                prefix, yes_no(c->restrict_realtime),
                prefix, exec_keyring_mode_to_string(c->keyring_mode));

        if (c->root_image)
                fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);

        /* Environment, one line per entry */
        STRV_FOREACH(e, c->environment)
                fprintf(f, "%sEnvironment: %s\n", prefix, *e);

        STRV_FOREACH(e, c->environment_files)
                fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);

        STRV_FOREACH(e, c->pass_environment)
                fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);

        STRV_FOREACH(e, c->unset_environment)
                fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);

        fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));

        /* For each directory type: the mode, followed by one line per configured path */
        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
                fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);

                STRV_FOREACH(d, c->directories[dt].paths)
                        fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
        }

        if (c->nice_set)
                fprintf(f,
                        "%sNice: %i\n",
                        prefix, c->nice);

        if (c->oom_score_adjust_set)
                fprintf(f,
                        "%sOOMScoreAdjust: %i\n",
                        prefix, c->oom_score_adjust);

        /* Resource limits: the hard limit under the plain name, the soft limit with a "Soft" suffix */
        for (i = 0; i < RLIM_NLIMITS; i++)
                if (c->rlimit[i]) {
                        fprintf(f, "%s%s: " RLIM_FMT "\n",
                                prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
                        fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
                                prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
                }

        if (c->ioprio_set) {
                _cleanup_free_ char *class_str = NULL;

                /* The class is left out if it cannot be converted to a string */
                r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
                if (r >= 0)
                        fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);

                fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
        }

        if (c->cpu_sched_set) {
                _cleanup_free_ char *policy_str = NULL;

                r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
                if (r >= 0)
                        fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);

                fprintf(f,
                        "%sCPUSchedulingPriority: %i\n"
                        "%sCPUSchedulingResetOnFork: %s\n",
                        prefix, c->cpu_sched_priority,
                        prefix, yes_no(c->cpu_sched_reset_on_fork));
        }

        if (c->cpuset) {
                /* List the indexes of all CPUs that are part of the set */
                fprintf(f, "%sCPUAffinity:", prefix);
                for (i = 0; i < c->cpuset_ncpus; i++)
                        if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
                                fprintf(f, " %u", i);
                fputs("\n", f);
        }

        if (c->timer_slack_nsec != NSEC_INFINITY)
                fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);

        fprintf(f,
                "%sStandardInput: %s\n"
                "%sStandardOutput: %s\n"
                "%sStandardError: %s\n",
                prefix, exec_input_to_string(c->std_input),
                prefix, exec_output_to_string(c->std_output),
                prefix, exec_output_to_string(c->std_error));

        if (c->tty_path)
                fprintf(f,
                        "%sTTYPath: %s\n"
                        "%sTTYReset: %s\n"
                        "%sTTYVHangup: %s\n"
                        "%sTTYVTDisallocate: %s\n",
                        prefix, c->tty_path,
                        prefix, yes_no(c->tty_reset),
                        prefix, yes_no(c->tty_vhangup),
                        prefix, yes_no(c->tty_vt_disallocate));

        /* The syslog facility and level are only shown if stdout or stderr go to syslog, kmsg or the
         * journal */
        if (IN_SET(c->std_output,
                   EXEC_OUTPUT_SYSLOG,
                   EXEC_OUTPUT_KMSG,
                   EXEC_OUTPUT_JOURNAL,
                   EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
                   EXEC_OUTPUT_KMSG_AND_CONSOLE,
                   EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
            IN_SET(c->std_error,
                   EXEC_OUTPUT_SYSLOG,
                   EXEC_OUTPUT_KMSG,
                   EXEC_OUTPUT_JOURNAL,
                   EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
                   EXEC_OUTPUT_KMSG_AND_CONSOLE,
                   EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {

                _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;

                r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
                if (r >= 0)
                        fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);

                r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
                if (r >= 0)
                        fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
        }

        if (c->secure_bits) {
                _cleanup_free_ char *str = NULL;

                r = secure_bits_to_string_alloc(c->secure_bits, &str);
                if (r >= 0)
                        fprintf(f, "%sSecure Bits: %s\n", prefix, str);
        }

        if (c->capability_bounding_set != CAP_ALL) {
                _cleanup_free_ char *str = NULL;

                r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
                if (r >= 0)
                        fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
        }

        if (c->capability_ambient_set != 0) {
                _cleanup_free_ char *str = NULL;

                r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
                if (r >= 0)
                        fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
        }

        if (c->user)
                fprintf(f, "%sUser: %s\n", prefix, c->user);
        if (c->group)
                fprintf(f, "%sGroup: %s\n", prefix, c->group);

        fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));

        if (!strv_isempty(c->supplementary_groups)) {
                fprintf(f, "%sSupplementaryGroups:", prefix);
                strv_fprintf(f, c->supplementary_groups);
                fputs("\n", f);
        }

        if (c->pam_name)
                fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);

        if (!strv_isempty(c->read_write_paths)) {
                fprintf(f, "%sReadWritePaths:", prefix);
                strv_fprintf(f, c->read_write_paths);
                fputs("\n", f);
        }

        if (!strv_isempty(c->read_only_paths)) {
                fprintf(f, "%sReadOnlyPaths:", prefix);
                strv_fprintf(f, c->read_only_paths);
                fputs("\n", f);
        }

        if (!strv_isempty(c->inaccessible_paths)) {
                fprintf(f, "%sInaccessiblePaths:", prefix);
                strv_fprintf(f, c->inaccessible_paths);
                fputs("\n", f);
        }

        /* One line per bind mount, in the form source:destination:rbind|norbind */
        for (i = 0; i < c->n_bind_mounts; i++) {
                fprintf(f, "%s%s: %s:%s:%s\n", prefix,
                        c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
                        c->bind_mounts[i].source,
                        c->bind_mounts[i].destination,
                        c->bind_mounts[i].recursive ? "rbind" : "norbind");
        }

        if (c->utmp_id)
                fprintf(f,
                        "%sUtmpIdentifier: %s\n",
                        prefix, c->utmp_id);

        /* Security labels. A "-" is prepended if the respective *_ignore flag is set. */
        if (c->selinux_context)
                fprintf(f,
                        "%sSELinuxContext: %s%s\n",
                        prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);

        if (c->apparmor_profile)
                fprintf(f,
                        "%sAppArmorProfile: %s%s\n",
                        prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);

        if (c->smack_process_label)
                fprintf(f,
                        "%sSmackProcessLabel: %s%s\n",
                        prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);

        if (c->personality != PERSONALITY_INVALID)
                fprintf(f,
                        "%sPersonality: %s\n",
                        prefix, strna(personality_to_string(c->personality)));

        fprintf(f,
                "%sLockPersonality: %s\n",
                prefix, yes_no(c->lock_personality));

        if (c->syscall_filter) {
#if HAVE_SECCOMP
                Iterator j;
                void *id;
                bool first = true;
#endif

                fprintf(f,
                        "%sSystemCallFilter: ",
                        prefix);

                /* A "~" is shown in front of the list if the filter is not a whitelist */
                if (!c->syscall_whitelist)
                        fputc('~', f);

#if HAVE_SECCOMP
                SET_FOREACH(id, c->syscall_filter, j) {
                        _cleanup_free_ char *name = NULL;

                        /* Separate the entries by spaces */
                        if (first)
                                first = false;
                        else
                                fputc(' ', f);

                        /* The set stores the system call number plus one */
                        name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
                        fputs(strna(name), f);
                }
#endif

                fputc('\n', f);
        }

        if (c->syscall_archs) {
#if HAVE_SECCOMP
                Iterator j;
                void *id;
#endif

                fprintf(f,
                        "%sSystemCallArchitectures:",
                        prefix);

#if HAVE_SECCOMP
                /* The set stores the architecture identifier plus one */
                SET_FOREACH(id, c->syscall_archs, j)
                        fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
#endif
                fputc('\n', f);
        }

        if (exec_context_restrict_namespaces_set(c)) {
                _cleanup_free_ char *s = NULL;

                r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
                if (r >= 0)
                        fprintf(f, "%sRestrictNamespaces: %s\n",
                                prefix, s);
        }

        if (c->syscall_errno > 0)
                fprintf(f,
                        "%sSystemCallErrorNumber: %s\n",
                        prefix, strna(errno_to_name(c->syscall_errno)));
}

/* Returns true if the process forked off would run under an unchanged UID or as root, i.e. if no user is
 * configured, or the configured user is "root" or "0". */
bool exec_context_maintains_privileges(ExecContext *c) {
        const char *user;

        assert(c);

        user = c->user;

        return !user ||
                streq(user, "root") ||
                streq(user, "0");
}

/* Returns the IO priority that applies to the context: the configured one if it is set, otherwise the one
 * ioprio_get() reports for the calling process, and best-effort class with priority 4 if that query
 * fails. */
int exec_context_get_effective_ioprio(ExecContext *c) {
        int current;

        assert(c);

        if (c->ioprio_set)
                return c->ioprio;

        current = ioprio_get(IOPRIO_WHO_PROCESS, 0);

        return current >= 0 ? current : IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
}

/* Resets the status structure and records that the process with the specified PID was started now. */
void exec_status_start(ExecStatus *s, pid_t pid) {
        assert(s);

        /* Forget everything known about a previous invocation first */
        zero(*s);

        dual_timestamp_get(&s->start_timestamp);
        s->pid = pid;
}

/* Records in s that the process with the specified PID exited now, with the given code and status. If s
 * carries a different, non-zero PID it is reset first. If a context is passed, a utmp record for the dead
 * process is written in case a utmp identifier is configured, and exec_context_tty_reset() is invoked. */
void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
        assert(s);

        /* The data stored so far belongs to some other process, drop it */
        if (s->pid != 0 && s->pid != pid)
                zero(*s);

        s->pid = pid;
        s->code = code;
        s->status = status;
        dual_timestamp_get(&s->exit_timestamp);

        if (!context)
                return;

        if (context->utmp_id)
                utmp_put_dead_process(context->utmp_id, pid, code, status);

        exec_context_tty_reset(context, NULL);
}

/* Writes the PID, and as far as they are set the start and exit timestamps plus exit code and status, of the
 * status structure to f, each line starting with prefix. Nothing is written if no PID is recorded. */
void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
        char ts[FORMAT_TIMESTAMP_MAX];
        bool started, exited;

        assert(s);
        assert(f);

        if (s->pid <= 0)
                return;

        prefix = strempty(prefix);

        fprintf(f,
                "%sPID: "PID_FMT"\n",
                prefix, s->pid);

        started = dual_timestamp_is_set(&s->start_timestamp);
        if (started)
                fprintf(f,
                        "%sStart Timestamp: %s\n",
                        prefix, format_timestamp(ts, sizeof(ts), s->start_timestamp.realtime));

        exited = dual_timestamp_is_set(&s->exit_timestamp);
        if (exited)
                fprintf(f,
                        "%sExit Timestamp: %s\n"
                        "%sExit Code: %s\n"
                        "%sExit Status: %i\n",
                        prefix, format_timestamp(ts, sizeof(ts), s->exit_timestamp.realtime),
                        prefix, sigchld_code_to_string(s->code),
                        prefix, s->status);
}

char *exec_command_line(char **argv) {
        size_t k;
        char *n, *p, **a;
        bool first = true;

        assert(argv);

        k = 1;
        STRV_FOREACH(a, argv)
                k += strlen(*a)+3;

        n = new(char, k);
        if (!n)
                return NULL;

        p = n;
        STRV_FOREACH(a, argv) {

                if (!first)
                        *(p++) = ' ';
                else
                        first = false;

                if (strpbrk(*a, WHITESPACE)) {
                        *(p++) = '\'';
                        p = stpcpy(p, *a);
                        *(p++) = '\'';
                } else
                        p = stpcpy(p, *a);

        }

        *p = 0;

        /* FIXME: this doesn't really handle arguments that have
         * spaces and ticks in them */

        return n;
}

/* Writes the command line of the command to f, followed by its execution status, which is indented by one
 * additional tab. If the command line cannot be allocated the ENOMEM error string is shown in its place. */
void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
        _cleanup_free_ char *cmd = NULL;
        const char *status_prefix, *shown;

        assert(c);
        assert(f);

        prefix = strempty(prefix);
        status_prefix = strjoina(prefix, "\t");

        cmd = exec_command_line(c->argv);
        if (cmd)
                shown = cmd;
        else
                shown = strerror(ENOMEM);

        fprintf(f,
                "%sCommand Line: %s\n",
                prefix, shown);

        exec_status_dump(&c->exec_status, f, status_prefix);
}

/* Invokes exec_command_dump() for each command of the list starting at c, in list order. */
void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
        ExecCommand *i;

        assert(f);

        prefix = strempty(prefix);

        LIST_FOREACH(command, i, c)
                exec_command_dump(i, f, prefix);
}

void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
        /* Appends e to the command list *l. An empty list simply becomes e. */

        ExecCommand *tail;

        assert(l);
        assert(e);

        if (!*l) {
                *l = e;
                return;
        }

        /* Insert after the current tail: the order of the commands matters, so new entries must
         * always go to the end. */
        LIST_FIND_TAIL(command, *l, tail);
        LIST_INSERT_AFTER(command, *l, tail, e);
}

int exec_command_set(ExecCommand *c, const char *path, ...) {
        /* Replaces both the path and the argument vector of c. The new argument vector is built from
         * path plus the variadic arguments via strv_new_ap(). Returns 0 on success and -ENOMEM if
         * either allocation fails, in which case c is left unmodified. */

        char **args, *path_copy;
        va_list ap;

        assert(c);
        assert(path);

        va_start(ap, path);
        args = strv_new_ap(path, ap);
        va_end(ap);

        if (!args)
                return -ENOMEM;

        path_copy = strdup(path);
        if (!path_copy)
                goto oom;

        /* Both allocations succeeded, now swap in the new values */
        free(c->path);
        c->path = path_copy;

        strv_free(c->argv);
        c->argv = args;

        return 0;

oom:
        strv_free(args);
        return -ENOMEM;
}

int exec_command_append(ExecCommand *c, const char *path, ...) {
        /* Appends path and the variadic arguments that follow it to the argument vector of c. Returns
         * 0 on success, -ENOMEM if the temporary vector cannot be allocated, or the negative error
         * reported by strv_extend_strv(). */

        _cleanup_strv_free_ char **extra = NULL;
        va_list ap;
        int r;

        assert(c);
        assert(path);

        va_start(ap, path);
        extra = strv_new_ap(path, ap);
        va_end(ap);

        if (!extra)
                return -ENOMEM;

        r = strv_extend_strv(&c->argv, extra, false);

        /* Non-negative results are all reported as plain success */
        return r < 0 ? r : 0;
}


static int exec_runtime_allocate(ExecRuntime **rt) {
        /* Makes sure *rt points to a runtime object. An existing object is left untouched; otherwise
         * a zero-initialized one is allocated with a reference count of 1 and both netns socket slots
         * marked as unset (-1). Returns 0 on success, -ENOMEM on allocation failure. */

        ExecRuntime *n;

        if (*rt)
                return 0;

        n = new0(ExecRuntime, 1);
        if (!n)
                return -ENOMEM;

        n->n_ref = 1;
        n->netns_storage_socket[0] = -1;
        n->netns_storage_socket[1] = -1;

        *rt = n;
        return 0;
}

int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
        /* Sets up the runtime object in *rt according to the execution context c.
         *
         * Returns 1 if *rt is set (either it already was, or it was created here), 0 if the context
         * requests neither a private network nor a private /tmp and hence no runtime object is
         * needed, and a negative errno-style value on failure.
         *
         * On failure *rt is reset to NULL again. Since a non-NULL *rt short-cuts this function to
         * "already set up", a half-initialized object must never be left behind: a later call would
         * otherwise report success even though the socket pair or the temporary directories are
         * missing. */

        int r;

        assert(rt);
        assert(c);
        assert(id);

        if (*rt)
                return 1;

        if (!c->private_network && !c->private_tmp)
                return 0;

        r = exec_runtime_allocate(rt);
        if (r < 0)
                return r;

        if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0) {
                        /* Save errno right away, the cleanup below might clobber it */
                        r = -errno;
                        goto fail;
                }
        }

        if (c->private_tmp && !(*rt)->tmp_dir) {
                r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
                if (r < 0)
                        goto fail;
        }

        return 1;

fail:
        /* The object was allocated above with a single reference, hence dropping that reference
         * releases it together with whatever was set up so far. */
        *rt = exec_runtime_unref(*rt);
        return r;
}

ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
        /* Takes one more reference on r and returns r, so that the call can be used in expressions */

        assert(r);
        assert(r->n_ref > 0);

        ++r->n_ref;

        return r;
}

ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
        /* Drops one reference on r; NULL is accepted and ignored. Once the last reference is gone the
         * stored directory names are freed, the netns socket pair is closed and the object itself is
         * released. Always returns NULL. */

        if (!r)
                return NULL;

        assert(r->n_ref > 0);

        if (--r->n_ref > 0)
                return NULL;

        safe_close_pair(r->netns_storage_socket);
        free(r->var_tmp_dir);
        free(r->tmp_dir);

        return mfree(r);
}

int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
        /* Writes the state of rt to f as serialization items of unit u. The two temporary directory
         * paths are stored as strings. Each open end of the netns socket pair is duplicated into fds
         * and the number of the duplicate is stored. A NULL rt is a no-op. Returns 0 on success, or
         * the negative value returned by fdset_put_dup() if duplicating a socket fails. */

        static const char* const socket_keys[2] = {
                "netns-socket-0",
                "netns-socket-1",
        };
        unsigned i;

        assert(u);
        assert(f);
        assert(fds);

        if (!rt)
                return 0;

        if (rt->tmp_dir)
                unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);

        if (rt->var_tmp_dir)
                unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);

        for (i = 0; i < 2; i++) {
                int copy;

                if (rt->netns_storage_socket[i] < 0)
                        continue;

                copy = fdset_put_dup(fds, rt->netns_storage_socket[i]);
                if (copy < 0)
                        return copy;

                unit_serialize_item_format(u, f, socket_keys[i], "%i", copy);
        }

        return 0;
}

int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
        /* Restores a single serialized key/value pair into *rt, allocating the runtime object on
         * demand. Returns 0 if the key is not one handled here, 1 if it was consumed, and the result
         * of log_oom() if memory runs out. A socket value that does not parse as integer or is not
         * contained in fds is only logged at debug level; the item still counts as consumed. */

        enum {
                ITEM_TMP_DIR,
                ITEM_VAR_TMP_DIR,
                ITEM_NETNS_SOCKET_0,
                ITEM_NETNS_SOCKET_1,
        } item;

        assert(rt);
        assert(key);
        assert(value);

        if (streq(key, "tmp-dir"))
                item = ITEM_TMP_DIR;
        else if (streq(key, "var-tmp-dir"))
                item = ITEM_VAR_TMP_DIR;
        else if (streq(key, "netns-socket-0"))
                item = ITEM_NETNS_SOCKET_0;
        else if (streq(key, "netns-socket-1"))
                item = ITEM_NETNS_SOCKET_1;
        else
                return 0;

        /* Only keys we know about get here, hence the object is never allocated for foreign keys */
        if (exec_runtime_allocate(rt) < 0)
                return log_oom();

        if (item == ITEM_TMP_DIR || item == ITEM_VAR_TMP_DIR) {
                char **field, *copy;

                field = item == ITEM_TMP_DIR ? &(*rt)->tmp_dir : &(*rt)->var_tmp_dir;

                copy = strdup(value);
                if (!copy)
                        return log_oom();

                free(*field);
                *field = copy;
        } else {
                int *slot, fd;

                slot = &(*rt)->netns_storage_socket[item == ITEM_NETNS_SOCKET_0 ? 0 : 1];

                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
                else {
                        /* Close whatever was stored before, then take the fd out of the set */
                        safe_close(*slot);
                        *slot = fdset_remove(fds, fd);
                }
        }

        return 1;
}

static void *remove_tmpdir_thread(void *p) {
        /* Job function handed to asynchronous_job(): removes the directory tree whose path is passed
         * in p, then frees the path string. The result of the removal is deliberately ignored. */

        char *path = p;

        (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
        free(path);

        return NULL;
}

void exec_runtime_destroy(ExecRuntime *rt) {
        /* Releases the resources held by rt: both temporary directories are handed to
         * asynchronous_job() for removal and the netns socket pair is closed. Nothing is done if rt is
         * NULL or if there are multiple users of it, in which case the stuff is left around. */

        char **dirs[2];
        unsigned i;
        int r;

        if (!rt || rt->n_ref > 1)
                return;

        dirs[0] = &rt->tmp_dir;
        dirs[1] = &rt->var_tmp_dir;

        for (i = 0; i < 2; i++) {
                char **dir = dirs[i];

                if (!*dir)
                        continue;

                log_debug("Spawning thread to nuke %s", *dir);

                /* If the job is queued, the job function frees the string; we only free it ourselves
                 * when queuing fails. */
                r = asynchronous_job(remove_tmpdir_thread, *dir);
                if (r < 0) {
                        log_warning_errno(r, "Failed to nuke %s: %m", *dir);
                        free(*dir);
                }

                *dir = NULL;
        }

        safe_close_pair(rt->netns_storage_socket);
}

/* Textual names of the ExecInput values, indexed by enum value. The macro below presumably generates
 * the matching to-string/from-string helpers from this table (defined elsewhere, confirm there). */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL]      = "null",
        [EXEC_INPUT_TTY]       = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL]  = "tty-fail",
        [EXEC_INPUT_SOCKET]    = "socket",
        [EXEC_INPUT_NAMED_FD]  = "fd",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);

/* Textual names of the ExecOutput values, indexed by enum value. The macro below presumably generates
 * the matching to-string/from-string helpers from this table (defined elsewhere, confirm there). */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT]             = "inherit",
        [EXEC_OUTPUT_NULL]                = "null",
        [EXEC_OUTPUT_TTY]                 = "tty",
        [EXEC_OUTPUT_SYSLOG]              = "syslog",
        [EXEC_OUTPUT_SYSLOG_AND_CONSOLE]  = "syslog+console",
        [EXEC_OUTPUT_KMSG]                = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE]    = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL]             = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET]              = "socket",
        [EXEC_OUTPUT_NAMED_FD]            = "fd",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);

/* Textual names of the ExecUtmpMode values, indexed by enum value. The macro below presumably
 * generates the matching to-string/from-string helpers (defined elsewhere, confirm there). */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT]  = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER]  = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);

/* Textual names of the ExecPreserveMode values, indexed by enum value. The lookup macro used here is
 * the boolean-aware variant and is passed EXEC_PRESERVE_YES; presumably that is the value a boolean
 * "true" string maps to (macro defined elsewhere, confirm there). */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO]      = "no",
        [EXEC_PRESERVE_YES]     = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);

/* Textual names of the ExecDirectoryType values, indexed by enum value. The macro below presumably
 * generates the matching to-string/from-string helpers (defined elsewhere, confirm there). */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE]         = "StateDirectory",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);

/* Textual names of the ExecKeyringMode values, indexed by enum value. The macro below presumably
 * generates the matching to-string/from-string helpers (defined elsewhere, confirm there). */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED]  = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);
-												license: add GPLv2+ license blurbs everwhere

											
										
										
											2010-02-03 13:03:47 +01:00
+								/***
 								  This file is part of systemd.
 								  Copyright 2010 Lennart Poettering
 								  systemd is free software; you can redistribute it and/or modify it
-												relicense to LGPLv2.1 (with exceptions)

We finally got the OK from all contributors with non-trivial commits to
relicense systemd from GPL2+ to LGPL2.1+.

Some udev bits continue to be GPL2+ for now, but we are looking into
relicensing them too, to allow free copy/paste of all code within
systemd.

The bits that used to be MIT continue to be MIT.

The big benefit of the relicensing is that closed source code may now
link against libsystemd-login.so and friends.

											
										
										
											2012-04-12 00:20:58 +02:00
+								  under the terms of the GNU Lesser General Public License as published by
 								  the Free Software Foundation; either version 2.1 of the License, or
-												license: add GPLv2+ license blurbs everwhere

											
										
										
											2010-02-03 13:03:47 +01:00
+								  (at your option) any later version.
 								  systemd is distributed in the hope that it will be useful, but
 								  WITHOUT ANY WARRANTY; without even the implied warranty of
 								  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-												relicense to LGPLv2.1 (with exceptions)

We finally got the OK from all contributors with non-trivial commits to
relicense systemd from GPL2+ to LGPL2.1+.

Some udev bits continue to be GPL2+ for now, but we are looking into
relicensing them too, to allow free copy/paste of all code within
systemd.

The bits that used to be MIT continue to be MIT.

The big benefit of the relicensing is that closed source code may now
link against libsystemd-login.so and friends.

											
										
										
											2012-04-12 00:20:58 +02:00
+								  Lesser General Public License for more details.
-												license: add GPLv2+ license blurbs everwhere

											
										
										
											2010-02-03 13:03:47 +01:00
-												relicense to LGPLv2.1 (with exceptions)

We finally got the OK from all contributors with non-trivial commits to
relicense systemd from GPL2+ to LGPL2.1+.

Some udev bits continue to be GPL2+ for now, but we are looking into
relicensing them too, to allow free copy/paste of all code within
systemd.

The bits that used to be MIT continue to be MIT.

The big benefit of the relicensing is that closed source code may now
link against libsystemd-login.so and friends.

											
										
										
											2012-04-12 00:20:58 +02:00
+								  You should have received a copy of the GNU Lesser General Public License
-												license: add GPLv2+ license blurbs everwhere

											
										
										
											2010-02-03 13:03:47 +01:00
+								  along with systemd; If not, see <http://www.gnu.org/licenses/>.
 								***/
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								#include <errno.h>
 								#include <fcntl.h>
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include <glob.h>
 								#include <grp.h>
 								#include <poll.h>
-												reset signal mask when forking

											
										
										
											2010-01-27 06:17:51 +01:00
+								#include <signal.h>
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include <string.h>
-												core: set NoNewPrivileges for seccomp if we don't have CAP_SYS_ADMIN

The manpage of seccomp specify that using seccomp with
SECCOMP_SET_MODE_FILTER will return EACCES if the caller do not have
CAP_SYS_ADMIN set, or if the no_new_privileges bit is not set. Hence,
without NoNewPrivilege set, it is impossible to use a SystemCall*
directive with a User directive set in system mode.

Now, NoNewPrivileges is set if we are in user mode, or if we are in
system mode and we don't have CAP_SYS_ADMIN, and SystemCall*
directives are used.

											
										
										
											2016-01-30 17:26:39 +01:00
+								#include <sys/capability.h>
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								#include <sys/eventfd.h>
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								#include <sys/mman.h>
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include <sys/personality.h>
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								#include <sys/prctl.h>
-												seccomp: also block shmat(..., SHM_EXEC) for MemoryDenyWriteExecute

shmat(..., SHM_EXEC) can be used to create writable and executable
memory, so let's block it when MemoryDenyWriteExecute is set.

											
										
										
											2016-10-26 17:52:53 +02:00
+								#include <sys/shm.h>
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include <sys/socket.h>
-												execute: allow configuration of O_NONBLOCK flag from .service files

											
										
										
											2010-02-12 02:00:18 +01:00
+								#include <sys/stat.h>
-												seccomp: also block shmat(..., SHM_EXEC) for MemoryDenyWriteExecute

shmat(..., SHM_EXEC) can be used to create writable and executable
memory, so let's block it when MemoryDenyWriteExecute is set.

											
										
										
											2016-10-26 17:52:53 +02:00
+								#include <sys/types.h>
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include <sys/un.h>
 								#include <unistd.h>
-												core: optionally create LOGIN_PROCESS or USER_PROCESS utmp entries

When generating utmp/wtmp entries, optionally add both LOGIN_PROCESS and
INIT_PROCESS entries or even all three of LOGIN_PROCESS, INIT_PROCESS
and USER_PROCESS entries, instead of just a single INIT_PROCESS entry.

With this change systemd may be used to not only invoke a getty directly
in a SysV-compliant way but alternatively also a login(1) implementation
or even forego getty and login entirely, and invoke arbitrary shells in
a way that they appear in who(1) or w(1).

This is preparation for a later commit that adds a "machinectl shell"
operation to invoke a shell in a container, in a way that is compatible
with who(1) and w(1).

											
										
										
											2015-08-23 13:14:04 +02:00
+								#include <utmpx.h>
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that is the name is mispellt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_PAM
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								#include <security/pam_appl.h>
 								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that is the name is mispellt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SELINUX
-												exec: Add SELinuxContext configuration item

This permit to let system administrators decide of the domain of a service.
This can be used with templated units to have each service in a différent
domain ( for example, a per customer database, using MLS or anything ),
or can be used to force a non selinux enabled system (jvm, erlang, etc)
to start in a different domain for each service.

											
										
										
											2014-02-06 10:05:16 +01:00
+								#include <selinux/selinux.h>
 								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that is the name is mispellt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								#include <seccomp.h>
 								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that is the name is mispellt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_APPARMOR
-												core: Add AppArmor profile switching

This permit to switch to a specific apparmor profile when starting a daemon. This
will result in a non operation if apparmor is disabled.
It also add a new build requirement on libapparmor for using this feature.

											
										
										
											2014-02-20 16:19:44 +01:00
+								#include <sys/apparmor.h>
 								#endif
-												util: split out signal-util.[ch] from util.[ch]

No functional changes.

											
										
										
											2015-05-29 20:14:11 +02:00
+								#include "sd-messages.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
 								#include "af-list.h"
-												util-lib: split out allocation calls into alloc-util.[ch]

											
										
										
											2015-10-27 03:01:06 +01:00
+								#include "alloc-util.h"
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that is the name is mispellt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_APPARMOR
-												util-lib: split out fd-related operations into fd-util.[ch]

There are more than enough to deserve their own .c file, hence move them
over.

											
										
										
											2015-10-25 13:14:12 +01:00
+								#include "apparmor-util.h"
 								#endif
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "async.h"
 								#include "barrier.h"
 								#include "cap-list.h"
-												src/basic: rename audit.[ch] → audit-util.[ch] and capability.[ch] → capability-util.[ch]

The files are named too generically, so that they might conflict with
the upstream project headers. Hence, let's add a "-util" suffix, to
clarify that this are just our utility headers and not any official
upstream headers.

											
										
										
											2015-10-26 23:32:16 +01:00
+								#include "capability-util.h"
-												core: chown() StateDirectory= and friends recursively when starting a service

This is particularly useful when used in conjunction with DynamicUser=1,
where the UID might change for every invocation, but is useful in other
cases too, for example, when these directories are shared between
systems where the UID assignments differ slightly.

											
										
										
											2017-09-28 19:13:44 +02:00
+								#include "chown-recursive.h"
-												def: centralize definition of default timeout in one place

											
										
										
											2011-03-17 04:02:35 +01:00
+								#include "def.h"
-												env: considerably beef up environment cleaning logic

Now, actually check if the environment variable names and values used
are valid, before accepting them. With this in place are at some places
more rigid than POSIX, and less rigid at others. For example, this code
allows lower-case environment variables (which POSIX suggests not to
use), but it will not allow non-UTF8 variable values.

All in all this should be a good middle ground of what to allow and what
not to allow as environment variables.

(This also splits out all environment related calls into env-util.[ch])

											
										
										
											2013-02-11 03:46:08 +01:00
+								#include "env-util.h"
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								#include "errno-list.h"
-												util-lib: split out fd-related operations into fd-util.[ch]

There are more than enough to deserve their own .c file, hence move them
over.

											
										
										
											2015-10-25 13:14:12 +01:00
+								#include "execute.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "exit-status.h"
-												util-lib: split out fd-related operations into fd-util.[ch]

There are more than enough to deserve their own .c file, hence move them
over.

											
										
										
											2015-10-25 13:14:12 +01:00
+								#include "fd-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existign $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "fileio.h"
-												Rename formats-util.h to format-util.h

We don't have plural in the name of any other -util files and this
inconsistency trips me up every time I try to type this file name
from memory. "formats-util" is even hard to pronounce.

											
										
										
											2016-11-07 16:14:59 +01:00
+								#include "format-util.h"
-												util-lib: move a number of fs operations into fs-util.[ch]

											
										
										
											2015-10-26 21:16:26 +01:00
+								#include "fs-util.h"
-												util-lib: split out globbing related calls into glob-util.[ch]

											
										
										
											2015-10-27 01:48:17 +01:00
+								#include "glob-util.h"
-												util-lib: split out IO related calls to io-util.[ch]

											
										
										
											2015-10-25 14:08:25 +01:00
+								#include "io-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "ioprio.h"
-												core: chown() StateDirectory= and friends recursively when starting a service

This is particularly useful when used in conjunction with DynamicUser=1,
where the UID might change for every invocation, but is useful in other
cases too, for example, when these directories are shared between
systems where the UID assignments differ slightly.

											
										
										
											2017-09-28 19:13:44 +02:00
+								#include "label.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "log.h"
 								#include "macro.h"
 								#include "missing.h"
 								#include "mkdir.h"
 								#include "namespace.h"
-												util-lib: split string parsing related calls from util.[ch] into parse-util.[ch]

											
										
										
											2015-10-26 16:18:16 +01:00
+								#include "parse-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "path-util.h"
-												shared: add process-util.[ch]

											
										
										
											2015-04-10 19:10:00 +02:00
+								#include "process-util.h"
-												util-lib: split out resource limits related calls into rlimit-util.[ch]

											
										
										
											2015-10-26 19:40:43 +01:00
+								#include "rlimit-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "rm-rf.h"
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												util-lib: split out fd-related operations into fd-util.[ch]

There are more than enough to deserve their own .c file, hence move them
over.

											
										
										
											2015-10-25 13:14:12 +01:00
+								#include "seccomp-util.h"
 								#endif
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "securebits.h"
-												securebits-util: add secure_bits_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:40:25 +02:00
+								#include "securebits-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "selinux-util.h"
-												util: split out signal-util.[ch] from util.[ch]

No functional changes.

											
										
										
											2015-05-29 20:14:11 +02:00
+								#include "signal-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "smack-util.h"
-												core: bypass dynamic user lookups from dbus-daemon

dbus-daemon does NSS name look-ups in order to enforce its bus policy. This
might dead-lock if an NSS module wants to use D-Bus for the look-up itself,
like our nss-systemd does. Let's work around this by bypassing bus
communication in the NSS module if we run inside of dbus-daemon. To make this
work we keep a bit of extra state in /run/systemd/dynamic-uid/ so that we don't
have to consult the bus, but can still resolve the names.

Note that the normal codepath continues to be via the bus, so that resolving
works from all mount namespaces and is subject to authentication, as before.

This is a bit dirty, but not too dirty, as dbus daemon is kinda special anyway
for PID 1.

											
										
										
											2016-08-02 12:28:51 +02:00
+								#include "special.h"
-												util-lib: move string table stuff into its own string-table.[ch]

											
										
										
											2015-10-26 22:31:05 +01:00
+								#include "string-table.h"
-												util-lib: split out string-related calls from util.[ch] into its own file string-util.[ch]

There are more than enough calls doing string manipulations to deserve
its own files, hence do something about it.

This patch also sorts the #include blocks of all files that needed to be
updated, according to the sorting suggestions from CODING_STYLE. Since
pretty much every file needs our string manipulation functions this
effectively means that most files have sorted #include blocks now.

Also touches a few unrelated include files.

											
										
										
											2015-10-24 22:58:24 +02:00
+								#include "string-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "strv.h"
-												util-lib: split out syslog-related calls into syslog-util.[ch]

											
										
										
											2015-10-27 00:40:25 +01:00
+								#include "syslog-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "terminal-util.h"
 								#include "unit.h"
-												util-lib: split out user/group/uid/gid calls into user-util.[ch]

											
										
										
											2015-10-25 22:32:30 +01:00
+								#include "user-util.h"
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								#include "util.h"
 								#include "utmp-wtmp.h"
-												first attempt at implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
-												service: for Type=idle units consider START_PRE, START, START_POST all as ACTIVE

We want to avoid a deadlock when a service has ExecStartPre= programs
that wait for the job queue to run empty because of Type=idle, but which
themselves keep the queue non-empty because START_PRE was considered
ACTIVATING and hence the job not complete. With this patch we alter the
state translation table so that it is impossible ever to wait for
Type=idle unit, hence removing the deadlock.

											
										
										
											2012-05-24 02:22:35 +02:00
+								#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
+								#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
-												execute: use a much lower idle timeout than the default time

The idle timeout after all is for cosmetics only, hence avoid any
substantial delays just for it.

											
										
										
											2012-05-22 19:26:13 +02:00
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
+								/* This assumes there is a 'tty' group */
 								#define TTY_MODE 0620
-												execute: also set SO_SNDBUF when spawning a service with stdout/stderr connected to journald

											
										
										
											2013-12-16 20:00:09 +01:00
+								#define SNDBUF_SIZE (8*1024*1024)
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								static int shift_fds(int fds[], unsigned n_fds) {
 								        int start, restart_from;
 								        if (n_fds <= 0)
 								                return 0;
-												util: move close_all_fds() to util.c

											
										
										
											2010-04-06 23:35:59 +02:00
+								        /* Modifies the fds array! (sorts it) */
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        assert(fds);
 								        start = 0;
 								        for (;;) {
 								                int i;
 								                restart_from = -1;
 								                for (i = start; i < (int) n_fds; i++) {
 								                        int nfd;
 								                        /* Already at right index? */
 								                        if (fds[i] == i+3)
 								                                continue;
-												tree-wide: don't do assignments within if checks

Turn this:

       if ((r = foo()) < 0) { ...

into this:

       r = foo();
       if (r < 0) { ...

											
										
										
											2015-09-08 19:14:10 +02:00
+								                        nfd = fcntl(fds[i], F_DUPFD, i + 3);
 								                        if (nfd < 0)
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								                                return -errno;
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop > 200 lines of code that
was required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								                        safe_close(fds[i]);
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								                        fds[i] = nfd;
 								                        /* Hmm, the fd we wanted isn't free? Then
-												core: correct spacing near eol in code comments

											
										
										
											2014-12-10 20:00:08 +01:00
+								                         * let's remember that and try again from here */
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								                        if (nfd != i+3 && restart_from < 0)
 								                                restart_from = i;
 								                }
 								                if (restart_from < 0)
 								                        break;
 								                start = restart_from;
 								        }
 								        return 0;
 								}
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
 								        unsigned i, n_fds;
-												execute: use fd_nonblock()/fd_cloexec() where applicable

											
										
										
											2010-04-06 21:53:39 +02:00
+								        int r;
-												drop O_CLOEXEC/O_NONBLOCK from files intended for forked clients

											
										
										
											2010-01-27 06:18:45 +01:00
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								        n_fds = n_storage_fds + n_socket_fds;
-												drop O_CLOEXEC/O_NONBLOCK from files intended for forked clients

											
										
										
											2010-01-27 06:18:45 +01:00
+								        if (n_fds <= 0)
 								                return 0;
 								        assert(fds);
-												core: only apply NonBlocking= to fds passed via socket activation

Make sure to only apply the O_NONBLOCK flag to the fds passed via socket
activation.

Previously the flag was also applied to the fds which came from the fd store
but this was incorrect since services, after being restarted, expect that these
passed fds have their flags unchanged and can be reused as before.

The documentation was a bit unclear about this so clarify it.

											
										
										
											2017-05-12 11:32:53 +02:00
+								        /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 								         * O_NONBLOCK only applies to socket activation though. */
-												drop O_CLOEXEC/O_NONBLOCK from files intended for forked clients

											
										
										
											2010-01-27 06:18:45 +01:00
 								        for (i = 0; i < n_fds; i++) {
-												core: only apply NonBlocking= to fds passed via socket activation

Make sure to only apply the O_NONBLOCK flag to the fds passed via socket
activation.

Previously the flag was also applied to the fds which came from the fd store
but this was incorrect since services, after being restarted, expect that these
passed fds have their flags unchanged and can be reused as before.

The documentation was a bit unclear about this so clarify it.

											
										
										
											2017-05-12 11:32:53 +02:00
+								                if (i < n_socket_fds) {
 								                        r = fd_nonblock(fds[i], nonblock);
 								                        if (r < 0)
 								                                return r;
 								                }
-												drop O_CLOEXEC/O_NONBLOCK from files intended for forked clients

											
										
										
											2010-01-27 06:18:45 +01:00
-												execute: allow configuration of O_NONBLOCK flag from .service files

											
										
										
											2010-02-12 02:00:18 +01:00
+								                /* We unconditionally drop FD_CLOEXEC from the fds,
 								                 * since after all we want to pass these fds to our
 								                 * children */
-												drop O_CLOEXEC/O_NONBLOCK from files intended for forked clients

											
										
										
											2010-01-27 06:18:45 +01:00
-												tree-wide: don't do assignments within if checks

Turn this:

       if ((r = foo()) < 0) { ...

into this:

       r = foo();
       if (r < 0) { ...

											
										
										
											2015-09-08 19:14:10 +02:00
+								                r = fd_cloexec(fds[i], false);
 								                if (r < 0)
-												execute: use fd_nonblock()/fd_cloexec() where applicable

											
										
										
											2010-04-06 21:53:39 +02:00
+								                        return r;
-												drop O_CLOEXEC/O_NONBLOCK from files intended for forked clients

											
										
										
											2010-01-27 06:18:45 +01:00
+								        }
 								        return 0;
 								}
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fds in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								static const char *exec_context_tty_path(const ExecContext *context) {
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(context);
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fds in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        if (context->stdio_as_fds)
 								                return NULL;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        if (context->tty_path)
 								                return context->tty_path;
 								        return "/dev/console";
 								}
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fds in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 								        const char *path;
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								        assert(context);
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fds in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        path = exec_context_tty_path(context);
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fds in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        if (context->tty_vhangup) {
 								                if (p && p->stdin_fd >= 0)
 								                        (void) terminal_vhangup_fd(p->stdin_fd);
 								                else if (path)
 								                        (void) terminal_vhangup(path);
 								        }
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
-												core: don't reset /dev/console if stdin/stdout/stderr are passed as fds in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        if (context->tty_reset) {
 								                if (p && p->stdin_fd >= 0)
 								                        (void) reset_terminal_fd(p->stdin_fd, true);
 								                else if (path)
 								                        (void) reset_terminal(path);
 								        }
 								        if (context->tty_vt_disallocate && path)
 								                (void) vt_disallocate(path);
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								}
-												core: inherit TERM from PID 1 for all services started on /dev/console

This way, invoking nspawn from a shell in the best case inherits the TERM
setting all the way down into the login shell spawned in the container.

Fixes: #3697

											
										
										
											2016-07-27 15:25:55 +02:00
+								static bool is_terminal_input(ExecInput i) {
 								        /* Returns true if i is one of EXEC_INPUT_TTY, EXEC_INPUT_TTY_FORCE or
 								         * EXEC_INPUT_TTY_FAIL, and false for every other input mode. */
 								        return IN_SET(i,
 								                      EXEC_INPUT_TTY,
 								                      EXEC_INPUT_TTY_FORCE,
 								                      EXEC_INPUT_TTY_FAIL);
 								}
-												core/execute: add internal is_terminal_output()

Similar to already existing is_terminal_input().

Note that the only current user (connect_logger_as) is never called
for EXEC_OUTPUT_TTY, so it won't mind whether we accept it.

											
										
										
											2013-02-28 01:35:47 +01:00
+								static bool is_terminal_output(ExecOutput o) {
-												core: inherit TERM from PID 1 for all services started on /dev/console

This way, invoking nspawn from a shell in the best case inherits the TERM
setting all the way down into the login shell spawned in the container.

Fixes: #3697

											
										
										
											2016-07-27 15:25:55 +02:00
+								        return IN_SET(o,
 								                      EXEC_OUTPUT_TTY,
 								                      EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 								                      EXEC_OUTPUT_KMSG_AND_CONSOLE,
 								                      EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 								}
-												execute: minor ExecOutput handling beautification (#6711)

Let's clean up the checking for the various ExecOutput values a bit,
let's use IN_SET everywhere, and the same concepts for all three bools
we pass to dprintf().
											
										
										
											2017-09-01 02:04:27 +02:00
+								static bool is_syslog_output(ExecOutput o) {
 								        return IN_SET(o,
 								                      EXEC_OUTPUT_SYSLOG,
 								                      EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 								}
 								/* Reports whether the given output mode is one of the two kmsg modes
 								 * (with or without the console). */
 								static bool is_kmsg_output(ExecOutput o) {
 								        return o == EXEC_OUTPUT_KMSG ||
 								               o == EXEC_OUTPUT_KMSG_AND_CONSOLE;
 								}
-												core: inherit TERM from PID 1 for all services started on /dev/console

This way, invoking nspawn from a shell in the best case inherits the TERM
setting all the way down into the login shell spawned in the container.

Fixes: #3697

											
										
										
											2016-07-27 15:25:55 +02:00
+								static bool exec_context_needs_term(const ExecContext *c) {
 								        assert(c);
 								        /* Return true if the execution context suggests we should set $TERM to something useful. */
 								        if (is_terminal_input(c->std_input))
 								                return true;
 								        if (is_terminal_output(c->std_output))
 								                return true;
 								        if (is_terminal_output(c->std_error))
 								                return true;
 								        return !!c->tty_path;
-												core/execute: add internal is_terminal_output()

Similar to already existing is_terminal_input().

Note that the only current user (connect_logger_as) is never called
for EXEC_OUTPUT_TTY, so it won't mind whether we accept it.

											
										
										
											2013-02-28 01:35:47 +01:00
+								}
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								static int open_null_as(int flags, int nfd) {
 								        int fd, r;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(nfd >= 0);
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        fd = open("/dev/null", flags|O_NOCTTY);
 								        if (fd < 0)
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
+								                return -errno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        if (fd != nfd) {
 								                r = dup2(fd, nfd) < 0 ? -errno : nfd;
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop a > 200 lines of code that
was required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								                safe_close(fd);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        } else
 								                r = nfd;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        return r;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
+								}
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
-												execute: make some code shorter

Let's simplify some lines to make it shorter.

											
										
										
											2017-07-14 18:58:57 +02:00
+								        static const union sockaddr_union sa = {
-												Use initalization instead of explicit zeroing

Before, we would initialize many fields twice: first
by filling the structure with zeros, and then a second
time with the real values. We can let the compiler do
the job for us, avoiding one copy.

A downside of this patch is that text gets slightly
bigger. This is because all zero() calls are effectively
inlined:

$ size build/.libs/systemd
         text    data     bss     dec     hex filename
before 897737  107300    2560 1007597   f5fed build/.libs/systemd
after  897873  107300    2560 1007733   f6075 build/.libs/systemd

… actually less than 1‰.

A few asserts that the parameter is not null had to be removed. I
don't think this changes much, because first, it is quite unlikely
for the assert to fail, and second, an immediate SEGV is almost as
good as an assert.

											
										
										
											2013-03-25 00:59:00 +01:00
+								                .un.sun_family = AF_UNIX,
 								                .un.sun_path = "/run/systemd/journal/stdout",
 								        };
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								        uid_t olduid = UID_INVALID;
 								        gid_t oldgid = GID_INVALID;
 								        int r;
-												core, sd-bus, logind: make use of uid_is_valid() in more places

											
										
										
											2017-07-14 18:57:04 +02:00
+								        if (gid_is_valid(gid)) {
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                oldgid = getgid();
-												execute: make some code shorter

Let's simplify some lines to make it shorter.

											
										
										
											2017-07-14 18:58:57 +02:00
+								                if (setegid(gid) < 0)
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                        return -errno;
 								        }
-												core, sd-bus, logind: make use of uid_is_valid() in more places

											
										
										
											2017-07-14 18:57:04 +02:00
+								        if (uid_is_valid(uid)) {
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                olduid = getuid();
-												execute: make some code shorter

Let's simplify some lines to make it shorter.

											
										
										
											2017-07-14 18:58:57 +02:00
+								                if (seteuid(uid) < 0) {
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                        r = -errno;
 								                        goto restore_gid;
 								                }
 								        }
-												execute: make some code shorter

Let's simplify some lines to make it shorter.

											
										
										
											2017-07-14 18:58:57 +02:00
+								        r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
 								        /* If we fail to restore the uid or gid, things will likely
 								           fail later on. This should only happen if an LSM interferes. */
-												core, sd-bus, logind: make use of uid_is_valid() in more places

											
										
										
											2017-07-14 18:57:04 +02:00
+								        if (uid_is_valid(uid))
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                (void) seteuid(olduid);
 								 restore_gid:
-												core, sd-bus, logind: make use of uid_is_valid() in more places

											
										
										
											2017-07-14 18:57:04 +02:00
+								        if (gid_is_valid(gid))
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                (void) setegid(oldgid);
 								        return r;
 								}
-												execute: minor coding style improvements

											
										
										
											2016-06-14 16:50:35 +02:00
+								static int connect_logger_as(
-												execute: normalize connect_logger_as() parameters slightly

All other functions in execute.c that need the unit id take a Unit* parameter
as first argument. Let's change connect_logger_as() to follow a similar logic.

											
										
										
											2016-07-02 04:57:21 +02:00
+								                Unit *unit,
-												execute: minor coding style improvements

											
										
										
											2016-06-14 16:50:35 +02:00
+								                const ExecContext *context,
-												execute: let's decouple execute.c a bit from the unit logic

Let's try to decouple the execution engine a bit from the Unit/Manager
concept, and hence pass one more flag as part of the ExecParameters flags
field.

											
										
										
											2017-08-01 10:28:20 +02:00
+								                const ExecParameters *params,
-												execute: minor coding style improvements

											
										
										
											2016-06-14 16:50:35 +02:00
+								                ExecOutput output,
 								                const char *ident,
 								                int nfd,
 								                uid_t uid,
 								                gid_t gid) {
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								        int fd, r;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
 								        assert(context);
-												execute: let's decouple execute.c a bit from the unit logic

Let's try to decouple the execution engine a bit from the Unit/Manager
concept, and hence pass one more flag as part of the ExecParameters flags
field.

											
										
										
											2017-08-01 10:28:20 +02:00
+								        assert(params);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(output < _EXEC_OUTPUT_MAX);
 								        assert(ident);
 								        assert(nfd >= 0);
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: talk directly to the journald, instead to the stdout-syslog-bridge

											
										
										
											2012-01-05 21:39:08 +01:00
+								        fd = socket(AF_UNIX, SOCK_STREAM, 0);
 								        if (fd < 0)
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                return -errno;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								        r = connect_journal_socket(fd, uid, gid);
 								        if (r < 0)
 								                return r;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        if (shutdown(fd, SHUT_RD) < 0) {
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop a > 200 lines of code that
was required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								                safe_close(fd);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                return -errno;
 								        }
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: minor coding style improvements

											
										
										
											2016-06-14 16:50:35 +02:00
+								        (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
-												execute: also set SO_SNDBUF when spawning a service with stdout/stderr connected to journald

											
										
										
											2013-12-16 20:00:09 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        dprintf(fd,
-												journal: set the _SYSTEMD_UNIT field for messages from terminated processes

As described in

  https://bugs.freedesktop.org/show_bug.cgi?id=50184

the journal currently doesn't set fields such as _SYSTEMD_UNIT
properly for messages coming from processes that have already
terminated.  This means among other things that "systemctl status" may
not show some of the output of services that wrote messages just
before they exited.

This patch fixes this by having processes that log to the journal
write their unit identifier to journald when the connection to
/run/systemd/journal/stdout is opened.  Journald stores the unit ID
and uses it to fill in _SYSTEMD_UNIT when it cannot be obtained
normally (i.e. from the cgroup).  To prevent impersonating another
unit, this information is only used when the caller is root.

This doesn't fix the general problem of getting metadata about
messages from terminated processes (which requires some kernel
support), but it allows "systemctl status" and similar queries to do
the Right Thing for units that log via stdout/stderr.

											
										
										
											2012-06-21 22:40:47 +02:00
+								                "%s\n"
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                "%s\n"
 								                "%i\n"
-												execute: talk directly to the journald, instead to the stdout-syslog-bridge

											
										
										
											2012-01-05 21:39:08 +01:00
+								                "%i\n"
 								                "%i\n"
 								                "%i\n"
-												logger: support printk() style priority prefixes

											
										
										
											2010-05-16 01:46:35 +02:00
+								                "%i\n",
-												execute: don't pass unit ID in --user mode to journald for stream logging

When we create a log stream connection to journald, we pass along the
unit ID. With this change we do this only when we run as system
instance, not as user instance, to remove the ambiguity whether a user
or system unit is specified. The effect of this change is minor:
journald ignores the field anyway from clients with UID != 0. This patch
hence only fixes the unit attribution for the --user instance of the
root user.

											
										
										
											2017-07-14 18:59:41 +02:00
+								                context->syslog_identifier ?: ident,
-												execute: let's decouple execute.c a bit from the unit logic

Let's try to decouple the execution engine a bit from the Unit/Manager
concept, and hence pass one more flag as part of the ExecParameters flags
field.

											
										
										
											2017-08-01 10:28:20 +02:00
+								                params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
-												execute: talk directly to the journald, instead to the stdout-syslog-bridge

											
										
										
											2012-01-05 21:39:08 +01:00
+								                context->syslog_priority,
 								                !!context->syslog_level_prefix,
-												execute: minor ExecOutput handling beautification (#6711)

Let's clean up the checking for the various ExecOutput values a bit,
let's use IN_SET everywhere, and the same concepts for all three bools
we pass to dprintf().
											
										
										
											2017-09-01 02:04:27 +02:00
+								                is_syslog_output(output),
 								                is_kmsg_output(output),
-												core/execute: add internal is_terminal_output()

Similar to already existing is_terminal_input().

Note that the only current user (connect_logger_as) is never called
for EXEC_OUTPUT_TTY, so it won't mind whether we accept it.

											
										
										
											2013-02-28 01:35:47 +01:00
+								                is_terminal_output(output));
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												execute: minor coding style improvements

											
										
										
											2016-06-14 16:50:35 +02:00
+								        if (fd == nfd)
 								                return nfd;
 								        r = dup2(fd, nfd) < 0 ? -errno : nfd;
 								        safe_close(fd);
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        return r;
 								}
 								/* Opens the terminal at path through open_terminal(), passing mode with O_NOCTTY
 								 * added, and makes it available as file descriptor nfd. Returns nfd on success;
 								 * on failure returns the negative value from open_terminal(), or a negative
 								 * errno-style value if dup2() fails. */
 								static int open_terminal_as(const char *path, mode_t mode, int nfd) {
 								        int tty_fd, ret;
 								        assert(path);
 								        assert(nfd >= 0);
 								        tty_fd = open_terminal(path, mode | O_NOCTTY);
 								        if (tty_fd < 0)
 								                return tty_fd;
 								        /* The terminal was opened on the requested descriptor number already */
 								        if (tty_fd == nfd)
 								                return nfd;
 								        /* Record the outcome of dup2() first, then drop the temporary descriptor */
 								        ret = dup2(tty_fd, nfd) < 0 ? -errno : nfd;
 								        safe_close(tty_fd);
 								        return ret;
 								}
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: if the main process of a service already owns the TTY, don't wait for acquiring it again in the reload/stop step

											
										
										
											2010-07-08 04:09:17 +02:00
+								static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) {
 								        if (is_terminal_input(std_input) && !apply_tty_stdin)
 								                return EXEC_INPUT_NULL;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								        if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								                return EXEC_INPUT_NULL;
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								        return std_input;
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								}
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								static int fixup_output(ExecOutput std_output, int socket_fd) {
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								        if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								                return EXEC_OUTPUT_INHERIT;
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								        return std_output;
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								}
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								static int setup_input(
 								                const ExecContext *context,
 								                const ExecParameters *params,
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                int socket_fd,
 								                int named_iofds[3]) {
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								        ExecInput i;
 								        assert(context);
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        assert(params);
 								        if (params->stdin_fd >= 0) {
 								                if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 								                        return -errno;
 								                /* Try to make this the controlling tty, if it is a tty, and reset it */
 								                (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 								                (void) reset_terminal_fd(STDIN_FILENO, true);
 								                return STDIN_FILENO;
 								        }
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												core: turn various execution flags into a proper flags parameter

The ExecParameters structure contains a number of bit-flags, that were so far
exposed as bool:1, change this to a proper, single binary bit flag field. This
makes things a bit more expressive, and is helpful as we add more flags, since
these booleans are passed around in various callers, for example
service_spawn(), whose signature can be made much shorter now.

Not all bit booleans from ExecParameters are moved into the flags field for
now, but this can be added later.

											
										
										
											2016-07-26 17:40:35 +02:00
+								        i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
 								        switch (i) {
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        case EXEC_INPUT_NULL:
 								                return open_null_as(O_RDONLY, STDIN_FILENO);
 								        case EXEC_INPUT_TTY:
 								        case EXEC_INPUT_TTY_FORCE:
 								        case EXEC_INPUT_TTY_FAIL: {
 								                int fd, r;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                fd = acquire_terminal(exec_context_tty_path(context),
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                                      i == EXEC_INPUT_TTY_FAIL,
 								                                      i == EXEC_INPUT_TTY_FORCE,
 								                                      false,
-												time-util: add and use USEC/NSEC_INFINIY

											
										
										
											2014-07-29 12:23:31 +02:00
+								                                      USEC_INFINITY);
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                if (fd < 0)
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                        return fd;
 								                if (fd != STDIN_FILENO) {
 								                        r = dup2(fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop a > 200 lines of code that
was required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								                        safe_close(fd);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                } else
 								                        r = STDIN_FILENO;
 								                return r;
 								        }
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								        case EXEC_INPUT_SOCKET:
 								                return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        case EXEC_INPUT_NAMED_FD:
 								                (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 								                return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        default:
 								                assert_not_reached("Unknown input type");
 								        }
 								}
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								static int setup_output(
 								                Unit *unit,
 								                const ExecContext *context,
 								                const ExecParameters *params,
 								                int fileno,
 								                int socket_fd,
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                int named_iofds[3],
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								                const char *ident,
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								                uid_t uid,
 								                gid_t gid,
 								                dev_t *journal_stream_dev,
 								                ino_t *journal_stream_ino) {
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								        ExecOutput o;
 								        ExecInput i;
-												execute: robustness against journald failures

Almost every unit logs to the journal. If journald gets a permanent
failure, units would not be able to start (exit code 209/STDOUT).

Add a fallback to /dev/null to avoid making the system entirely
unusable in such a case.

											
										
										
											2013-02-15 22:43:23 +01:00
+								        int r;
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        assert(unit);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(context);
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        assert(params);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        assert(ident);
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								        assert(journal_stream_dev);
 								        assert(journal_stream_ino);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 								                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 								                        return -errno;
 								                return STDOUT_FILENO;
 								        }
 								        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 								                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 								                        return -errno;
 								                return STDERR_FILENO;
 								        }
-												core: turn various execution flags into a proper flags parameter

The ExecParameters structure contains a number of bit-flags, that were so far
exposed as bool:1, change this to a proper, single binary bit flag field. This
makes things a bit more expressive, and is helpful as we add more flags, since
these booleans are passed around in various callers, for example
service_spawn(), whose signature can be made much shorter now.

Not all bit booleans from ExecParameters are moved into the flags field for
now, but this can be added later.

											
										
										
											2016-07-26 17:40:35 +02:00
+								        i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
-												execute: simplify stdin/stderr/stdout fixup a little

											
										
										
											2010-05-19 21:50:34 +02:00
+								        o = fixup_output(context->std_output, socket_fd);
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								        if (fileno == STDERR_FILENO) {
 								                ExecOutput e;
 								                e = fixup_output(context->std_error, socket_fd);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                /* This expects the input and output are already set up */
 								                /* Don't change the stderr file descriptor if we inherit all
 								                 * the way and are not on a tty */
 								                if (e == EXEC_OUTPUT_INHERIT &&
 								                    o == EXEC_OUTPUT_INHERIT &&
 								                    i == EXEC_INPUT_NULL &&
 								                    !is_terminal_input(context->std_input) &&
 								                    getppid () != 1)
 								                        return fileno;
 								                /* Duplicate from stdout if possible */
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                o = e;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								        } else if (o == EXEC_OUTPUT_INHERIT) {
-												execute: inherit from original input, not the fixed up

											
										
										
											2010-07-12 22:04:59 +02:00
+								                /* If input got downgraded, inherit the original value */
 								                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
-												execute: inherit from original input, not the fixed up

											
										
										
											2010-07-12 22:04:59 +02:00
-												execute: change stdout inherit logic, when run as PID 1 use /dev/null

											
										
										
											2010-07-07 04:37:42 +02:00
+								                /* If the input is connected to anything that's not a /dev/null, inherit that... */
-												execute: reopen console in the spawned processes, if necessary

											
										
										
											2010-05-20 01:08:13 +02:00
+								                if (i != EXEC_INPUT_NULL)
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
-												execute: change stdout inherit logic, when run as PID 1 use /dev/null

											
										
										
											2010-07-07 04:37:42 +02:00
+								                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 								                if (getppid() != 1)
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                        return fileno;
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                /* We need to open /dev/null here anew, to get the right access mode. */
 								                return open_null_as(O_WRONLY, fileno);
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								        switch (o) {
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        case EXEC_OUTPUT_NULL:
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                return open_null_as(O_WRONLY, fileno);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        case EXEC_OUTPUT_TTY:
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								                if (is_terminal_input(i))
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								                /* We don't reset the terminal if this is just about output */
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        case EXEC_OUTPUT_SYSLOG:
-												execute: optionally forward program output to /dev/console in addition to syslog/kmsg

											
										
										
											2011-02-15 01:27:53 +01:00
+								        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
-												execute: s/EXEC_OUTPUT_KERNEL/EXEC_OUTPUT_KMSG/ to follow LOG_TARGET_xxx nomenclature

											
										
										
											2010-05-19 21:49:03 +02:00
+								        case EXEC_OUTPUT_KMSG:
-												execute: optionally forward program output to /dev/console in addition to syslog/kmsg

											
										
										
											2011-02-15 01:27:53 +01:00
+								        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
-												journal: introduce log target 'journal' for executed processes

											
										
										
											2012-01-05 23:54:45 +01:00
+								        case EXEC_OUTPUT_JOURNAL:
 								        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
-												execute: let's decouple execute.c a bit from the unit logic

Let's try to decouple the execution engine a bit from the Unit/Manager
concept, and hence pass one more flag as part of the ExecParameters flags
field.

											
										
										
											2017-08-01 10:28:20 +02:00
+								                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
-												execute: robustness against journald failures

Almost every unit logs to the journal. If journald gets a permanent
failure, units would not be able to start (exit code 209/STDOUT).

Add a fallback to /dev/null to avoid making the system entirely
unusable in such a case.

											
										
										
											2013-02-15 22:43:23 +01:00
+								                if (r < 0) {
-												execute: downgrade a log message ERR → WARNING, since we proceed ignoring its result

											
										
										
											2017-09-26 17:42:17 +02:00
+								                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                        r = open_null_as(O_WRONLY, fileno);
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								                } else {
 								                        struct stat st;
 								                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 								                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
-												core: make sure that $JOURNAL_STREAM prefers stderr over stdout information (#6824)

If two separate log streams are connected to stdout and stderr, let's
make sure $JOURNAL_STREAM points to the latter, as that's the preferred
log destination, and the environment variable has been created in order
to permit services to automatically upgrade from stderr based logging to
native journal logging.

Also, document this behaviour.

Fixes: #6800
											
										
										
											2017-09-15 08:26:38 +02:00
+								                         * services to detect whether they are connected to the journal or not.
 								                         *
 								                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
 								                         * about STDERR as that's usually the best way to do logging. */
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
-												core: make sure that $JOURNAL_STREAM prefers stderr over stdout information (#6824)

If two separate log streams are connected to stdout and stderr, let's
make sure $JOURNAL_STREAM points to the latter, as that's the preferred
log destination, and the environment variable has been created in order
to permit services to automatically upgrade from stderr based logging to
native journal logging.

Also, document this behaviour.

Fixes: #6800
											
										
										
											2017-09-15 08:26:38 +02:00
+								                        if (fstat(fileno, &st) >= 0 &&
 								                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								                                *journal_stream_dev = st.st_dev;
 								                                *journal_stream_ino = st.st_ino;
 								                        }
-												execute: robustness against journald failures

Almost every unit logs to the journal. If journald gets a permanent
failure, units would not be able to start (exit code 209/STDOUT).

Add a fallback to /dev/null to avoid making the system entirely
unusable in such a case.

											
										
										
											2013-02-15 22:43:23 +01:00
+								                }
 								                return r;
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
 								        case EXEC_OUTPUT_SOCKET:
 								                assert(socket_fd >= 0);
-												execute: unify setup_{output,error}

The functions are quite similar. Unify them into one.

The source gets shorter, the binary gets slightly smaller.

											
										
										
											2013-02-15 23:36:23 +01:00
+								                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        case EXEC_OUTPUT_NAMED_FD:
 								                (void) fd_nonblock(named_iofds[fileno], false);
 								                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        default:
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                assert_not_reached("Unknown error type");
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        }
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
+								}
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
+								static int chown_terminal(int fd, uid_t uid) {
 								        struct stat st;
 								        assert(fd >= 0);
-												execute: check whether the specified fd is a tty before chowning/chmoding  it (#3457)

Let's add an extra safety check before we chmod/chown a TTY to the right user,
as we might end up having connected something to STDIN/STDOUT that is actually
not a TTY, even though this might have been requested, due to permissive
StandardInput= settings or transient service activation with fds passed in.

Fixes:

https://bugs.freedesktop.org/show_bug.cgi?id=85255
											
										
										
											2016-06-09 10:01:16 +02:00
+								        /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 								        if (isatty(fd) < 1)
 								                return 0;
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
+								        /* This might fail. What matters are the results. */
-												make gcc shut up

											
										
										
											2010-05-10 03:34:31 +02:00
+								        (void) fchown(fd, uid, -1);
 								        (void) fchmod(fd, TTY_MODE);
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
 								        if (fstat(fd, &st) < 0)
 								                return -errno;
-												execute: fix terminal chowning logic

											
										
										
											2010-04-13 21:13:49 +02:00
+								        if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
+								                return -EPERM;
 								        return 0;
 								}
-												core: allow to redirect confirmation messages to a different console

It's rather hard to parse the confirmation messages (enabled with
systemd.confirm_spawn=true) amongst the status messages and the kernel
ones (if enabled).

This patch gives the possibility to the user to redirect the confirmation
message to a different virtual console, either by giving its name or its path,
so those messages are separated from the other ones and easier to read.

											
										
										
											2016-11-02 10:38:22 +01:00
+								static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 								        int r;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        assert(_saved_stdin);
 								        assert(_saved_stdout);
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 								        if (saved_stdin < 0)
 								                return -errno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        if (saved_stdout < 0)
 								                return -errno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: allow to redirect confirmation messages to a different console

It's rather hard to parse the confirmation messages (enabled with
systemd.confirm_spawn=true) amongst the status messages and the kernel
ones (if enabled).

This patch gives the possibility to the user to redirect the confirmation
message to a different virtual console, either by giving its name or its path,
so those messages are separated from the other ones and easier to read.

											
										
										
											2016-11-02 10:38:22 +01:00
+								        fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        if (fd < 0)
 								                return fd;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        r = chown_terminal(fd, getuid());
 								        if (r < 0)
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								                return r;
-												execute: chown() the tty when running owning them

											
										
										
											2010-04-13 18:50:43 +02:00
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        r = reset_terminal_fd(fd, true);
 								        if (r < 0)
 								                return r;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        if (dup2(fd, STDIN_FILENO) < 0)
 								                return -errno;
 								        if (dup2(fd, STDOUT_FILENO) < 0)
 								                return -errno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        if (fd >= 2)
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop > 200 lines of code that
were required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								                safe_close(fd);
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        fd = -1;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
 								        *_saved_stdin = saved_stdin;
 								        *_saved_stdout = saved_stdout;
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        saved_stdin = saved_stdout = -1;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        return 0;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								}
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								static void write_confirm_error_fd(int err, int fd, const Unit *u) {
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        assert(err < 0);
 								        if (err == -ETIMEDOUT)
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								                dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        else {
 								                errno = -err;
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        }
 								}
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								static void write_confirm_error(int err, const char *vc, const Unit *u) {
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop > 200 lines of code that
were required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								        _cleanup_close_ int fd = -1;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        assert(vc);
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: allow to redirect confirmation messages to a different console

It's rather hard to parse the confirmation messages (enabled with
systemd.confirm_spawn=true) amongst the status messages and the kernel
ones (if enabled).

This patch gives the possibility to the user to redirect the confirmation
message to a different virtual console, either by giving its name or its path,
so those messages are separated from the other ones and easier to read.

											
										
										
											2016-11-02 10:38:22 +01:00
+								        fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        if (fd < 0)
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                return;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								        write_confirm_error_fd(err, fd, u);
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								}
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        int r = 0;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        assert(saved_stdin);
 								        assert(saved_stdout);
 								        release_terminal();
 								        if (*saved_stdin >= 0)
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                if (dup2(*saved_stdin, STDIN_FILENO) < 0)
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								                        r = -errno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        if (*saved_stdout >= 0)
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								                        r = -errno;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
-												util: do not reset terminal in acquire_terminal()

Before, we'd always reset acquired terminals, which is not really
desired, as we expose a setting TTYReset= which is supposed to control
whether the TTY is reset or not. Previously that setting would only
enable a second resetting of the TTY, which is of course pointless...

Hence, move the implicit resetting out of acquire_terminal() and make
the callers do it if they need it.

											
										
										
											2015-10-08 14:33:53 +02:00
+								        *saved_stdin = safe_close(*saved_stdin);
 								        *saved_stdout = safe_close(*saved_stdout);
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
 								        return r;
 								}
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								enum { /* Outcomes of the spawn-confirmation prompt; presumably the return values of ask_for_confirmation() — confirm against its body */
 								        CONFIRM_PRETEND_FAILURE = -1, /* don't execute; pretend that the command failed */
 								        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute; pretend that the command succeeded */
 								        CONFIRM_EXECUTE = 1, /* positive answer: execute the command */
 								};
-												core: add 'i' in confirm spawn to give a short summary of the unit to spawn

											
										
										
											2016-11-12 14:55:12 +01:00
+								static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        int saved_stdout = -1, saved_stdin = -1, r;
-												core: limit the length of the confirmation question

When "confirmation_spawn=1", the confirmation question can look like:

  Execute /usr/bin/kmod static-nodes --format=tmpfiles --output=/run/tmpfiles.d/kmod.conf? [Yes, No, Skip]

which is pretty verbose and might not fit in the console width size (which is
usually 80 chars) and thus the question will be split into 2 consecutive lines.

However since the question is now refreshed every 2 secs, the reprinted
question will overwrite the second line of the previous one...

To prevent this, this patch makes sure that the command line won't be longer
than 60 chars by ellipsizing it if the command is longer:

  Execute /usr/bin/kmod static-nodes --format=tmpfiles --output=/ru…nf? [Yes, No, View, Skip]

A following patch will introduce a new choice that will allow the user to get
details on the command to be executed so it will still be possible to see the
full command line.

											
										
										
											2016-11-07 17:14:59 +01:00
+								        _cleanup_free_ char *e = NULL;
-												core: rework ask_for_confirmation()

Now the reponses are handled by ask_for_confirmation() as well as the report of
any errors occuring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeed
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confim_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        char c;
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: rework ask_for_confirmation()

Now the reponses are handled by ask_for_confirmation() as well as the report of
any errors occuring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeed
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confim_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        /* For any internal errors, assume a positive response. */
-												core: allow to redirect confirmation messages to a different console

It's rather hard to parse the confirmation messages (enabled with
systemd.confirm_spawn=true) amongst the status messages and the kernel
ones (if enabled).

This patch gives the possibility to the user to redirect the confirmation
message to a different virtual console, either by giving its name or its path,
so those messages are separated from the other ones and easier to read.

											
										
										
											2016-11-02 10:38:22 +01:00
+								        r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
-												core: rework ask_for_confirmation()

Now the reponses are handled by ask_for_confirmation() as well as the report of
any errors occuring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeed
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confim_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								        if (r < 0) {
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								                write_confirm_error(r, vc, u);
-												core: rework ask_for_confirmation()

Now the reponses are handled by ask_for_confirmation() as well as the report of
any errors occuring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeed
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confim_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                return CONFIRM_EXECUTE;
 								        }
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: add 'c' in confirmation_spawn to resume the boot process

											
										
										
											2016-11-15 09:29:04 +01:00
+								        /* confirm_spawn might have been disabled while we were sleeping. */
 								        if (manager_is_confirm_spawn_disabled(u->manager)) {
 								                r = 1;
 								                goto restore_stdio;
 								        }
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: limit the length of the confirmation question

When "confirmation_spawn=1", the confirmation question can look like:

  Execute /usr/bin/kmod static-nodes --format=tmpfiles --output=/run/tmpfiles.d/kmod.conf? [Yes, No, Skip]

which is pretty verbose and might not fit in the console width size (which is
usually 80 chars) and thus question will be splitted into 2 consecutive lines.

However since the question is now refreshed every 2 secs, the reprinted
question will overwrite the second line of the previous one...

To prevent this, this patch makes sure that the command line won't be longer
than 60 chars by ellipsizing it if the command is longer:

  Execute /usr/bin/kmod static-nodes --format=tmpfiles --output=/ru…nf? [Yes, No, View, Skip]

A following patch will introduce a new choice that will allow the user to get
details on the command to be executed so it will still be possible to see the
full command line.

											
										
										
											2016-11-07 17:14:59 +01:00
+								        e = ellipsize(cmdline, 60, 100);
 								        if (!e) {
 								                log_oom();
 								                r = CONFIRM_EXECUTE;
 								                goto restore_stdio;
 								        }
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								        for (;;) {
-												core: in confirm spawn, suggest 'f' when user selects 'n' choice

											
										
										
											2016-11-17 18:22:43 +01:00
+								                r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                if (r < 0) {
-												core: include the unit name when notifying that a confirmation question timed out

											
										
										
											2016-11-10 10:07:42 +01:00
+								                        write_confirm_error_fd(r, STDOUT_FILENO, u);
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                        r = CONFIRM_EXECUTE;
 								                        goto restore_stdio;
 								                }
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                switch (c) {
-												core: add 'c' in confirmation_spawn to resume the boot process

											
										
										
											2016-11-15 09:29:04 +01:00
+								                case 'c':
 								                        printf("Resuming normal execution.\n");
 								                        manager_disable_confirm_spawn();
 								                        r = 1;
 								                        break;
-												core: add 'D' in confirmat spawn to show a full dump of the unit to spawn

											
										
										
											2016-11-12 15:08:29 +01:00
+								                case 'D':
 								                        unit_dump(u, stdout, "  ");
 								                        continue; /* ask again */
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                case 'f':
 								                        printf("Failing execution.\n");
 								                        r = CONFIRM_PRETEND_FAILURE;
 								                        break;
 								                case 'h':
-												core: add 'c' in confirmation_spawn to resume the boot process

											
										
										
											2016-11-15 09:29:04 +01:00
+								                        printf("  c - continue, proceed without asking anymore\n"
 								                               "  D - dump, show the state of the unit\n"
-												core: add 'D' in confirmat spawn to show a full dump of the unit to spawn

											
										
										
											2016-11-12 15:08:29 +01:00
+								                               "  f - fail, don't execute the command and pretend it failed\n"
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                               "  h - help\n"
-												core: add 'i' in confirm spawn to give a short summary of the unit to spawn

											
										
										
											2016-11-12 14:55:12 +01:00
+								                               "  i - info, show a short summary of the unit\n"
-												core: add 'j' in confirmation_spawn to list the jobs that are in progress

											
										
										
											2016-11-13 16:28:04 +01:00
+								                               "  j - jobs, show jobs that are in progress\n"
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                               "  s - skip, don't execute the command and pretend it succeeded\n"
 								                               "  y - yes, execute the command\n");
-												core: add 'D' in confirmat spawn to show a full dump of the unit to spawn

											
										
										
											2016-11-12 15:08:29 +01:00
+								                        continue; /* ask again */
-												core: add 'i' in confirm spawn to give a short summary of the unit to spawn

											
										
										
											2016-11-12 14:55:12 +01:00
+								                case 'i':
 								                        printf("  Description: %s\n"
 								                               "  Unit:        %s\n"
 								                               "  Command:     %s\n",
 								                               u->id, u->description, cmdline);
 								                        continue; /* ask again */
-												core: add 'j' in confirmation_spawn to list the jobs that are in progress

											
										
										
											2016-11-13 16:28:04 +01:00
+								                case 'j':
 								                        manager_dump_jobs(u->manager, stdout, "  ");
 								                        continue; /* ask again */
-												core: in confirm spawn, suggest 'f' when user selects 'n' choice

											
										
										
											2016-11-17 18:22:43 +01:00
+								                case 'n':
 								                        /* 'n' was removed in favor of 'f'. */
 								                        printf("Didn't understand 'n', did you mean 'f'?\n");
 								                        continue; /* ask again */
-												core: rework the confirmation spawn prompt

Previously it was "[Yes, Fail, Skip]" which is pretty misleading because it
suggests that the whole word needs to be entered instead of a single char.

Also this won't fit well when we'll extend the number of choices.

This patch addresses this by changing the choice hint with "[y, f, s – h for help]"
so it's now clear that a single letter has to be entered.

It also introduces a new choice 'h' which describes all possible choices since
a single letter can be not descriptive enough for new users.

It also allow to stick with the same hint string regardless of how
many choices we will support.

											
										
										
											2016-11-07 17:14:59 +01:00
+								                case 's':
 								                        printf("Skipping execution.\n");
 								                        r = CONFIRM_PRETEND_SUCCESS;
 								                        break;
 								                case 'y':
 								                        r = CONFIRM_EXECUTE;
 								                        break;
 								                default:
 								                        assert_not_reached("Unhandled choice");
 								                }
-												core: rework ask_for_confirmation()

Now the reponses are handled by ask_for_confirmation() as well as the report of
any errors occuring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeed
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confim_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                break;
 								        }
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
-												core: rework ask_for_confirmation()

Now the reponses are handled by ask_for_confirmation() as well as the report of
any errors occuring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeed
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confim_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								restore_stdio:
-												core: make systemd.confirm_spawn=1 actually work

This adds a timeout if the TTY cannot be acquired and makes sure we
always output the question to the console, never to the TTY of the
respective service.

											
										
										
											2012-06-26 12:16:18 +02:00
+								        restore_confirm_stdio(&saved_stdin, &saved_stdout);
 								        return r;
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								}
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								static int get_fixed_user(const ExecContext *c, const char **user,
 								                          uid_t *uid, gid_t *gid,
 								                          const char **home, const char **shell) {
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
+								        int r;
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        const char *name;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        assert(c);
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												Revert "core/execute: set HOME, USER also for root users"

This reverts commit 8b89628a10af3863bfc97872912e9da4076a5929.

This broke #5246

											
										
										
											2017-02-09 11:43:44 +01:00
+								        if (!c->user)
 								                return 0;
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 								         * (i.e. are "/" or "/bin/nologin"). */
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												Revert "core/execute: set HOME, USER also for root users"

This reverts commit 8b89628a10af3863bfc97872912e9da4076a5929.

This broke #5246

											
										
										
											2017-02-09 11:43:44 +01:00
+								        name = c->user;
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        r = get_user_creds_clean(&name, uid, gid, home, shell);
 								        if (r < 0)
 								                return r;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        *user = name;
 								        return 0;
 								}
 								/* Resolves the group configured in the execution context "c", if there is one.
 								 *
 								 * Without a configured group, 0 is returned and neither *group nor *gid is written. Otherwise the
 								 * group is looked up via get_group_creds(); on success the (possibly rewritten) group name is stored
 								 * in *group and 0 is returned. A negative lookup error is passed through unchanged. */
 								static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 								        assert(c);

 								        if (c->group) {
 								                const char *resolved = c->group;
 								                int r;

 								                r = get_group_creds(&resolved, gid);
 								                if (r < 0)
 								                        return r;

 								                *group = resolved;
 								        }

 								        return 0;
 								}
-												core: initialize user aux groups and SupplementaryGroups= when DynamicUser= is set

Make sure that when DynamicUser= is set we initialize the user's
supplementary groups and that we also support SupplementaryGroups=

Fixes: https://github.com/systemd/systemd/issues/4539

Thanks Evgeny Vereshchagin (@evverx)

											
										
										
											2016-11-02 22:42:40 +01:00
+								static int get_supplementary_groups(const ExecContext *c, const char *user,
 								                                    const char *group, gid_t gid,
 								                                    gid_t **supplementary_gids, int *ngids) {
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        char **i;
 								        int r, k = 0;
 								        int ngroups_max;
 								        bool keep_groups = false;
 								        gid_t *groups = NULL;
 								        _cleanup_free_ gid_t *l_gids = NULL;
 								        assert(c);
-												core: initialize groups list before checking SupplementaryGroups= of a unit (#4533)

Always initialize the supplementary groups of caller before checking the
unit SupplementaryGroups= option.

Fixes https://github.com/systemd/systemd/issues/4531
											
										
										
											2016-11-02 17:51:35 +01:00
+								        /*
 								         * If user is given, then lookup GID and supplementary groups list.
 								         * We avoid NSS lookups for gid=0. Also we have to initialize groups
-												core: initialize user aux groups and SupplementaryGroups= when DynamicUser= is set

Make sure that when DynamicUser= is set we initialize the user's
supplementary groups and that we also support SupplementaryGroups=

Fixes: https://github.com/systemd/systemd/issues/4539

Thanks Evgeny Vereshchagin (@evverx)

											
										
										
											2016-11-02 22:42:40 +01:00
+								         * here and as early as possible so we keep the list of supplementary
 								         * groups of the caller.
-												core: initialize groups list before checking SupplementaryGroups= of a unit (#4533)

Always initialize the supplementary groups of caller before checking the
unit SupplementaryGroups= option.

Fixes https://github.com/systemd/systemd/issues/4531
											
										
										
											2016-11-02 17:51:35 +01:00
+								         */
 								        if (user && gid_is_valid(gid) && gid != 0) {
 								                /* First step, initialize groups from /etc/groups */
 								                if (initgroups(user, gid) < 0)
 								                        return -errno;
 								                keep_groups = true;
 								        }
-												core: use strv_isempty to check if supplementary_groups is empty

With the previous commit, we know that it will be NULL if empty, but
it's safe to always use strv_isempty() in case the code changes
in the future.

											
										
										
											2017-10-04 11:33:30 +02:00
+								        if (strv_isempty(c->supplementary_groups))
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                return 0;
-												core: do not assert when sysconf(_SC_NGROUPS_MAX) fails (#4466)

Remove the assert and check the return code of sysconf(_SC_NGROUPS_MAX).

_SC_NGROUPS_MAX maps to NGROUPS_MAX which is defined in <limits.h> to
65536 these days. The value is a sysctl read-only
/proc/sys/kernel/ngroups_max and the kernel assumes that it is always
positive, otherwise things may break. Follow this and support only
positive values; in all other cases return either -errno or -EOPNOTSUPP.

Now if there are systems that want to re-write NGROUPS_MAX then they
should not pass SupplementaryGroups= in units even if it is empty, in
this case nothing fails and we just ignore supplementary groups. However
if SupplementaryGroups= is passed even if it is empty we have to assume
that there will be groups manipulation from our side or the kernel and
since the kernel always assumes that NGROUPS_MAX is positive, then
follow that and support only positive values.
											
										
										
											2016-10-24 13:13:06 +02:00
+								        /*
 								         * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 								         * be positive, otherwise fail.
 								         */
 								        errno = 0;
 								        ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 								        if (ngroups_max <= 0) {
 								                if (errno > 0)
 								                        return -errno;
 								                else
 								                        return -EOPNOTSUPP; /* For all other values */
 								        }
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        l_gids = new(gid_t, ngroups_max);
 								        if (!l_gids)
 								                return -ENOMEM;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        if (keep_groups) {
 								                /*
 								                 * Lookup the list of groups that the user belongs to, we
 								                 * avoid NSS lookups here too for gid=0.
 								                 */
 								                k = ngroups_max;
 								                if (getgrouplist(user, gid, l_gids, &k) < 0)
 								                        return -EINVAL;
 								        } else
 								                k = 0;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        STRV_FOREACH(i, c->supplementary_groups) {
 								                const char *g;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                if (k >= ngroups_max)
 								                        return -E2BIG;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                g = *i;
 								                r = get_group_creds(&g, l_gids+k);
 								                if (r < 0)
 								                        return r;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                k++;
 								        }
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        /*
 								         * Sets ngids to zero to drop all supplementary groups, happens
 								         * when we are under root and SupplementaryGroups= is empty.
 								         */
 								        if (k == 0) {
 								                *ngids = 0;
 								                return 0;
 								        }
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        /* Otherwise get the final list of supplementary groups */
 								        groups = memdup(l_gids, sizeof(gid_t) * k);
 								        if (!groups)
 								                return -ENOMEM;
 								        *supplementary_gids = groups;
 								        *ngids = k;
 								        groups = NULL;
 								        return 0;
 								}
-												core: cleanup for enforce_groups() (#7064)

SupplementaryGroups= is preprocessed in get_supplementary_groups().
So, it is not necessary to input ExecContext to enforce_groups().
											
										
										
											2017-10-12 08:10:25 +02:00
+								static int enforce_groups(gid_t gid, gid_t *supplementary_gids, int ngids) {
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        int r;
-												core: cleanup for enforce_groups() (#7064)

SupplementaryGroups= is preprocessed in get_supplementary_groups().
So, it is not necessary to input ExecContext to enforce_groups().
											
										
										
											2017-10-12 08:10:25 +02:00
+								        /* Handle SupplementaryGroups= if it is not empty */
 								        if (ngids > 0) {
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                r = maybe_setgroups(ngids, supplementary_gids);
 								                if (r < 0)
-												user-util: rework maybe_setgroups() a bit

Let's drop the caching of the setgroups /proc field for now. While there's a
strict regime in place when it changes states, let's better not cache it since
we cannot really be sure we follow that regime correctly.

More importantly however, this is not in performance sensitive code, and
there's no indication the cache is really beneficial, hence let's drop the
caching and make things a bit simpler.

Also, while we are at it, rework the error handling a bit, and always return
negative errno-style error codes, following our usual coding style. This has
the benefit that we can sensibly handle read_one_line_file() errors, without
having to update errno explicitly.

											
										
										
											2016-10-06 17:54:12 +02:00
+								                        return r;
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        }
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        if (gid_is_valid(gid)) {
 								                /* Then set our gids */
 								                if (setresgid(gid, gid, gid) < 0)
 								                        return -errno;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
+								        }
 								        return 0;
 								}
 								static int enforce_user(const ExecContext *context, uid_t uid) {
 								        assert(context);
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        if (!uid_is_valid(uid))
 								                return 0;
-												core: drop Capabilities= setting

The setting is hardly useful (since its effect is generally reduced to zero due
to file system caps), and with the advent of ambient caps an actually useful
replacement exists, hence let's get rid of this.

I am pretty sure this was unused and our man page already recommended against
its use, hence this should be a safe thing to remove.

											
										
										
											2016-02-12 23:29:57 +01:00
+								        /* Sets (but doesn't look up) the uid and make sure we keep the
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
+								         * capabilities while doing so. */
-												core: drop Capabilities= setting

The setting is hardly useful (since its effect is generally reduced to zero due
to file system caps), and with the advent of ambient caps an actually useful
replacement exists, hence let's get rid of this.

I am pretty sure this was unused and our man page already recommended against
its use, hence this should be a safe thing to remove.

											
										
										
											2016-02-12 23:29:57 +01:00
+								        if (context->capability_ambient_set != 0) {
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
 								                /* First step: If we need to keep capabilities but
 								                 * drop privileges we need to make sure we keep our
-												execute: convert secure bits into mask properly

C.f. http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5975c725dfd6f7d36f493ab1453fbdbd35c1f0e3

											
										
										
											2013-03-30 06:40:11 +01:00
+								                 * caps, while we drop privileges. */
-												execute: try to suppress PR_SET_SECUREBITS if unnecessary

											
										
										
											2010-03-31 16:25:33 +02:00
+								                if (uid != 0) {
-												execute: convert secure bits into mask properly

C.f. http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5975c725dfd6f7d36f493ab1453fbdbd35c1f0e3

											
										
										
											2013-03-30 06:40:11 +01:00
+								                        int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
-												execute: try to suppress PR_SET_SECUREBITS if unnecessary

											
										
										
											2010-03-31 16:25:33 +02:00
 								                        if (prctl(PR_GET_SECUREBITS) != sb)
 								                                if (prctl(PR_SET_SECUREBITS, sb) < 0)
 								                                        return -errno;
 								                }
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
+								        }
-												core: drop Capabilities= setting

The setting is hardly useful (since its effect is generally reduced to zero due
to file system caps), and with the advent of ambient caps an actually useful
replacement exists, hence let's get rid of this.

I am pretty sure this was unused and our man page already recommended against
its use, hence this should be a safe thing to remove.

											
										
										
											2016-02-12 23:29:57 +01:00
+								        /* Second step: actually set the uids */
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
+								        if (setresuid(uid, uid, uid) < 0)
 								                return -errno;
 								        /* At this point we should have all necessary capabilities but
 								           are otherwise a normal user. However, the caps might got
 								           corrupted due to the setresuid() so we need clean them up
 								           later. This is done outside of this call. */
 								        return 0;
 								}
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_PAM
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
 								/* PAM conversation callback that refuses every request: nothing is
 								 * inspected or written through the arguments, and PAM_CONV_ERR is
 								 * returned unconditionally. */
 								static int null_conv(int num_msg, const struct pam_message **msg,
 								                     struct pam_response **resp, void *appdata_ptr) {
 								        /* Conversations are not supported here */
 								        return PAM_CONV_ERR;
 								}
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
+								#endif
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								static int setup_pam(
 								                const char *name,
 								                const char *user,
-												sd-pam: Drop uid so parent signal arrives at child.

The PAM helper thread needs to capture the death signal from the
parent, but is prohibited from doing so since when the child dies
as normal user, the kernel won't allow it to send a TERM to the
PAM helper thread which is running as root.

This causes the PAM threads to never exit, accumulating after
user sessions exit.

There is however really no need to keep the PAM threads running as
root, so, we can just setresuid() to the same user as defined in the
unit file for the parent thread (User=). This makes the TERM signal
arrive as normal. In case setresuid() fails, we ignore the error, so
we at least fall back to the current behaviour.

											
										
										
											2012-05-17 21:17:42 +02:00
+								                uid_t uid,
-												core: leave PAM stub process around with GIDs updated

In the process execution code of PID 1, before
096424d1230e0a0339735c51b43949809e972430 the GID settings were changed before
invoking PAM, and the UID settings after. After the change both changes are
made after the PAM session hooks are run. When invoking PAM we fork once, and
leave a stub process around which will invoke the PAM session end hooks when
the session goes away. This code previously was dropping the remaining privs
(which were precisely the UID). Fix this code to do this correctly again, by
really dropping them else (i.e. the GID as well).

While we are at it, also fix error logging of this code.

Fixes: #4238

											
										
										
											2016-10-06 16:03:01 +02:00
+								                gid_t gid,
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                const char *tty,
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								                char ***env,
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                int fds[], unsigned n_fds) {
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_PAM
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        static const struct pam_conv conv = {
 								                .conv = null_conv,
 								                .appdata_ptr = NULL
 								        };
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        pam_handle_t *handle = NULL;
-												core: execute: fix regression in pam_setup()

Commit 72c0a2c25 ("everywhere: port everything to sigprocmask_many()
and friends") reworked code tree-wide to use the new sigprocmask_many()
helper. In this, it caused a regression in pam_setup, because it
dropped a line to initialize the 'ss' signal mask which is later used
in sigwait().

While at it, move the variable declaration to an inner scope.

											
										
										
											2015-06-17 14:31:49 +02:00
+								        sigset_t old_ss;
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								        int pam_code = PAM_SUCCESS, r;
-												execute: Do not alter call-by-ref parameter on failure

Prevent free from being called on (a part of) the call-by-reference
variable env when setup_pam fails.

											
										
										
											2016-07-07 12:41:52 +02:00
+								        char **nv, **e = NULL;
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        bool close_session = false;
 								        pid_t pam_pid = 0, parent_pid;
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								        int flags = 0;
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
 								        assert(name);
 								        assert(user);
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        assert(env);
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
 								        /* We set up PAM in the parent process, then fork. The child
-												Spelling Corrections

Just some lame spelling corrections with no functionality.

											
										
										
											2011-02-21 15:32:17 +01:00
+								         * will then stay around until killed via PR_GET_PDEATHSIG or
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								         * systemd via the cgroup logic. It will then remove the PAM
 								         * session again. The parent process will exec() the actual
 								         * daemon. We do things this way to ensure that the main PID
 								         * of the daemon is the one we initially fork()ed. */
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								        r = barrier_create(&barrier);
 								        if (r < 0)
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								                goto fail;
-												tree-wide: remove unnecessary LOG_PRI

LOG_DEBUG is already a log level, there is no need to use LOG_PRI which
is for filtering out the facility.

											
										
										
											2015-01-06 06:29:40 +01:00
+								        if (log_get_max_level() < LOG_DEBUG)
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                flags |= PAM_SILENT;
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        pam_code = pam_start(name, user, &conv, &handle);
 								        if (pam_code != PAM_SUCCESS) {
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                handle = NULL;
 								                goto fail;
 								        }
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        if (tty) {
 								                pam_code = pam_set_item(handle, PAM_TTY, tty);
 								                if (pam_code != PAM_SUCCESS)
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                        goto fail;
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        }
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
-												execute: Do not alter call-by-ref parameter on failure

Prevent free from being called on (a part of) the call-by-reference
variable env when setup_pam fails.

											
										
										
											2016-07-07 12:41:52 +02:00
+								        STRV_FOREACH(nv, *env) {
 								                pam_code = pam_putenv(handle, *nv);
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								                if (pam_code != PAM_SUCCESS)
 								                        goto fail;
 								        }
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								        pam_code = pam_acct_mgmt(handle, flags);
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        if (pam_code != PAM_SUCCESS)
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                goto fail;
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								        pam_code = pam_open_session(handle, flags);
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        if (pam_code != PAM_SUCCESS)
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                goto fail;
 								        close_session = true;
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        e = pam_getenvlist(handle);
 								        if (!e) {
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                pam_code = PAM_BUF_ERR;
 								                goto fail;
 								        }
 								        /* Block SIGTERM, so that we know that it won't get lost in
 								         * the child */
-												tree-wide: whenever we fork off a foreign child process reset signal mask/handlers

Also, when the child is potentially long-running make sure to set a
death signal.

Also, ignore the result of the reset operations explicitly by casting
them to (void).

											
										
										
											2015-05-31 23:55:55 +02:00
-												everywhere: port everything to sigprocmask_many() and friends

This ports a lot of manual code over to sigprocmask_many() and friends.

Also, we now consistently check for sigprocmask() failures with
assert_se(), since the call cannot realistically fail unless there's a
programming error.

Also encloses a few sd_event_add_signal() calls with (void) when we
ignore the return values for it knowingly.

											
										
										
											2015-06-15 20:13:23 +02:00
+								        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
-												tree-wide: make use of getpid_cached() wherever we can

This moves pretty much all uses of getpid() over to getpid_raw(). I
didn't specifically check whether the optimization is worth it for each
replacement, but in order to keep things simple and systematic I
switched over everything at once.

											
										
										
											2017-07-20 16:19:18 +02:00
+								        parent_pid = getpid_cached();
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								        pam_pid = fork();
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								        if (pam_pid < 0) {
 								                r = -errno;
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                goto fail;
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								        }
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
 								        if (pam_pid == 0) {
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								                int sig, ret = EXIT_PAM;
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
 								                /* The child's job is to reset the PAM session on
 								                 * termination */
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								                barrier_set_role(&barrier, BARRIER_CHILD);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
 								                /* This string must fit in 10 chars (i.e. the length
-												exec: include path name of binary we are about to execute when renaming forked off processes

Immediately after forking off a process change the comm name and argv[0]
to "(foobar)" where "foobar" is the basename of the path we are about to
execute.

This should be useful when charting boot progress.

											
										
										
											2012-02-01 22:33:15 +01:00
+								                 * of "/sbin/init"), to look pretty in /bin/ps */
 								                rename_process("(sd-pam)");
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
 								                /* Make sure we don't keep open the passed fds in this
 								                child. We assume that otherwise only those fds are
 								                open here that have been opened by PAM. */
 								                close_many(fds, n_fds);
-												sd-pam: Drop uid so parent signal arrives at child.

The PAM helper thread needs to capture the death signal from the
parent, but is prohibited from doing so since when the child dies
as normal user, the kernel won't allow it to send a TERM to the
PAM helper thread which is running as root.

This causes the PAM threads to never exit, accumulating after
user sessions exit.

There is however really no need to keep the PAM threads running as
root, so, we can just setresuid() to the same user as defined in the
unit file for the parent thread (User=). This makes the TERM signal
arrive as normal. In case setresuid() fails, we ignore the error, so
we at least fall back to the current behaviour.

											
										
										
											2012-05-17 21:17:42 +02:00
+								                /* Drop privileges - we don't need any to pam_close_session
 								                 * and this will make PR_SET_PDEATHSIG work in most cases.
 								                 * If this fails, ignore the error - but expect sd-pam threads
 								                 * to fail to exit normally */
-												core: leave PAM stub process around with GIDs updated

In the process execution code of PID 1, before
096424d1230e0a0339735c51b43949809e972430 the GID settings were changed before
invoking PAM, and the UID settings after. After the change both changes are
made after the PAM session hooks are run. When invoking PAM we fork once, and
leave a stub process around which will invoke the PAM session end hooks when
the session goes away. This code previously was dropping the remaining privs
(which were precisely the UID). Fix this code to do this correctly again, by
really dropping all of them (i.e. the GID as well).

While we are at it, also fix error logging of this code.

Fixes: #4238

											
										
										
											2016-10-06 16:03:01 +02:00
-												user-util: rework maybe_setgroups() a bit

Let's drop the caching of the setgroups /proc field for now. While there's a
strict regime in place when it changes states, let's better not cache it since
we cannot really be sure we follow that regime correctly.

More importantly however, this is not in performance sensitive code, and
there's no indication the cache is really beneficial, hence let's drop the
caching and make things a bit simpler.

Also, while we are at it, rework the error handling a bit, and always return
negative errno-style error codes, following our usual coding style. This has
the benefit that we can sensibly handle read_one_line_file() errors, without
having to update errno explicitly.

											
										
										
											2016-10-06 17:54:12 +02:00
+								                r = maybe_setgroups(0, NULL);
 								                if (r < 0)
 								                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
-												core: leave PAM stub process around with GIDs updated

In the process execution code of PID 1, before
096424d1230e0a0339735c51b43949809e972430 the GID settings were changed before
invoking PAM, and the UID settings after. After the change both changes are
made after the PAM session hooks are run. When invoking PAM we fork once, and
leave a stub process around which will invoke the PAM session end hooks when
the session goes away. This code previously was dropping the remaining privs
(which were precisely the UID). Fix this code to do this correctly again, by
really dropping all of them (i.e. the GID as well).

While we are at it, also fix error logging of this code.

Fixes: #4238

											
										
										
											2016-10-06 16:03:01 +02:00
+								                if (setresgid(gid, gid, gid) < 0)
 								                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
-												sd-pam: Drop uid so parent signal arrives at child.

The PAM helper thread needs to capture the death signal from the
parent, but is prohibited from doing so since when the child dies
as normal user, the kernel won't allow it to send a TERM to the
PAM helper thread which is running as root.

This causes the PAM threads to never exit, accumulating after
user sessions exit.

There is however really no need to keep the PAM threads running as
root, so, we can just setresuid() to the same user as defined in the
unit file for the parent thread (User=). This makes the TERM signal
arrive as normal. In case setresuid() fails, we ignore the error, so
we at least fall back to the current behaviour.

											
										
										
											2012-05-17 21:17:42 +02:00
+								                if (setresuid(uid, uid, uid) < 0)
-												core: leave PAM stub process around with GIDs updated

In the process execution code of PID 1, before
096424d1230e0a0339735c51b43949809e972430 the GID settings were changed before
invoking PAM, and the UID settings after. After the change both changes are
made after the PAM session hooks are run. When invoking PAM we fork once, and
leave a stub process around which will invoke the PAM session end hooks when
the session goes away. This code previously was dropping the remaining privs
(which were precisely the UID). Fix this code to do this correctly again, by
really dropping all of them (i.e. the GID as well).

While we are at it, also fix error logging of this code.

Fixes: #4238

											
										
										
											2016-10-06 16:03:01 +02:00
+								                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
-												sd-pam: Drop uid so parent signal arrives at child.

The PAM helper thread needs to capture the death signal from the
parent, but is prohibited from doing so since when the child dies
as normal user, the kernel won't allow it to send a TERM to the
PAM helper thread which is running as root.

This causes the PAM threads to never exit, accumulating after
user sessions exit.

There is however really no need to keep the PAM threads running as
root, so, we can just setresuid() to the same user as defined in the
unit file for the parent thread (User=). This makes the TERM signal
arrive as normal. In case setresuid() fails, we ignore the error, so
we at least fall back to the current behaviour.

											
										
										
											2012-05-17 21:17:42 +02:00
-												tree-wide: whenever we fork off a foreign child process reset signal mask/handlers

Also, when the child is potentially long-running make sure to set a
death signal.

Also, ignore the result of the reset operations explicitly by casting
them to (void).

											
										
										
											2015-05-31 23:55:55 +02:00
+								                (void) ignore_signals(SIGPIPE, -1);
-												sd-pam: Drop uid so parent signal arrives at child.

The PAM helper thread needs to capture the death signal from the
parent, but is prohibited from doing so since when the child dies
as normal user, the kernel won't allow it to send a TERM to the
PAM helper thread which is running as root.

This causes the PAM threads to never exit, accumulating after
user sessions exit.

There is however really no need to keep the PAM threads running as
root, so, we can just setresuid() to the same user as defined in the
unit file for the parent thread (User=). This makes the TERM signal
arrive as normal. In case setresuid() fails, we ignore the error, so
we at least fall back to the current behaviour.

											
										
										
											2012-05-17 21:17:42 +02:00
+								                /* Wait until our parent died. This will only work if
 								                 * the above setresuid() succeeds, otherwise the kernel
 								                 * will not allow unprivileged parents kill their privileged
 								                 * children this way. We rely on the control groups kill logic
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                 * to do the rest for us. */
 								                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
 								                        goto child_finish;
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								                /* Tell the parent that our setup is done. This is especially
 								                 * important regarding dropping privileges. Otherwise, unit
-												core/execute: add (void)

CID #778045.

											
										
										
											2017-02-19 19:48:59 +01:00
+								                 * setup might race against our setresuid(2) call.
 								                 *
 								                 * If the parent aborted, we'll detect this below, hence ignore
 								                 * return failure here. */
 								                (void) barrier_place(&barrier);
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
-												core/execute: add (void)

CID #778045.

											
										
										
											2017-02-19 19:48:59 +01:00
+								                /* Check if our parent process might already have died? */
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                if (getppid() == parent_pid) {
-												core: execute: fix regression in pam_setup()

Commit 72c0a2c25 ("everywhere: port everything to sigprocmask_many()
and friends") reworked code tree-wide to use the new sigprocmask_many()
helper. In this, it caused a regression in pam_setup, because it
dropped a line to initialize the 'ss' signal mask which is later used
in sigwait().

While at it, move the variable declaration to an inner scope.

											
										
										
											2015-06-17 14:31:49 +02:00
+								                        sigset_t ss;
 								                        assert_se(sigemptyset(&ss) >= 0);
 								                        assert_se(sigaddset(&ss, SIGTERM) >= 0);
-												execute: invoke sigwait() in a loop when waiting for PAM parent, to avoid spurious wake-ups

											
										
										
											2011-06-30 04:15:39 +02:00
+								                        for (;;) {
 								                                if (sigwait(&ss, &sig) < 0) {
 								                                        if (errno == EINTR)
 								                                                continue;
 								                                        goto child_finish;
 								                                }
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												execute: invoke sigwait() in a loop when waiting for PAM parent, to avoid spurious wake-ups

											
										
										
											2011-06-30 04:15:39 +02:00
+								                                assert(sig == SIGTERM);
 								                                break;
 								                        }
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                }
-												execute: invoke sigwait() in a loop when waiting for PAM parent, to avoid spurious wake-ups

											
										
										
											2011-06-30 04:15:39 +02:00
+								                /* If our parent died we'll end the session */
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								                if (getppid() != parent_pid) {
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                        pam_code = pam_close_session(handle, flags);
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								                        if (pam_code != PAM_SUCCESS)
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                                goto child_finish;
-												execute.c: little modernization

											
										
										
											2013-08-28 13:54:43 +02:00
+								                }
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								                ret = 0;
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
 								        child_finish:
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                pam_end(handle, pam_code | flags);
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								                _exit(ret);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        }
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								        barrier_set_role(&barrier, BARRIER_PARENT);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        /* If the child was forked off successfully it will do all the
 								         * cleanups, so forget about the handle here. */
 								        handle = NULL;
-												execute: do initgroups() first, pam initialization second so that it can still modify the groups list

											
										
										
											2011-06-30 02:15:01 +02:00
+								        /* Unblock SIGTERM again in the parent */
-												everywhere: port everything to sigprocmask_many() and friends

This ports a lot of manual code over to sigprocmask_many() and friends.

Also, we now consistently check for sigprocmask() failures with
assert_se(), since the call cannot realistically fail unless there's a
programming error.

Also encloses a few sd_event_add_signal() calls with (void) when we
ignore the return values for it knowingly.

											
										
										
											2015-06-15 20:13:23 +02:00
+								        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
 								        /* We close the log explicitly here, since the PAM modules
 								         * might have opened it, but we don't want this fd around. */
 								        closelog();
-												core: make setup_pam() synchronous

If we spawn a unit with a non-empty 'PAMName=', we fork off a
child-process _inside_ the unit, known as '(sd-pam)', which watches the
session. It waits for the main-process to exit and then finishes it via
pam_close_session(3).

However, the '(sd-pam)' setup is highly asynchronous. There is no
guarantee that process gets spawned before we finish the unit setup.
Therefore, there might be a root-owned process inside of the cgroup of
the unit, thus causing cg_migrate() to error-out with EPERM.

This patch makes setup_pam() synchronous and waits for the '(sd-pam)'
setup to finish before continuing. This guarantees that setresuid(2) was
at least tried before we continue with the child setup of the real unit.
Note that if setresuid(2) fails, we already warn loudly about it. You
really must make sure that you own the passed user if using 'PAMName='.
It seems very plausible to rely on that assumption.

											
										
										
											2015-09-23 00:51:20 +02:00
+								        /* Synchronously wait for the child to initialize. We don't care for
 								         * errors as we cannot recover. However, warn loudly if it happens. */
 								        if (!barrier_place_and_sync(&barrier))
 								                log_error("PAM initialization failed");
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        strv_free(*env);
 								        *env = e;
-												execute: properly pass PAM environment to executed process

											
										
										
											2011-06-30 04:31:34 +02:00
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        return 0;
 								fail:
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								        if (pam_code != PAM_SUCCESS) {
 								                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								                r = -EPERM;  /* PAM errors do not map to errno */
 								        } else
 								                log_error_errno(r, "PAM failed: %m");
-												execute: make setup_pam() return -errno when possible

The only caller currently checks if the result is non-zero,
so nothing changes there.

											
										
										
											2011-11-17 00:16:22 +01:00
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        if (handle) {
 								                if (close_session)
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                        pam_code = pam_close_session(handle, flags);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												execute: more debugging messages

											
										
										
											2013-08-28 14:01:30 +02:00
+								                pam_end(handle, pam_code | flags);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        }
 								        strv_free(e);
 								        closelog();
-												core: normalize error handling a bit, in setup_pam()

Assign errno-style errors to a variable called "r" when they happen, the same way we do this in most other calls. It's
bad enough that the error handling part of the function deals with two different error variables (pam_code and r) now,
but before this fix it was even three!

											
										
										
											2016-01-22 12:06:39 +01:00
+								        return r;
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
+								#else
 								        return 0;
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								#endif
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
+								}
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												exec: include path name of binary we are about to execute when renaming forked off processes

Immediately after forking off a process change the comm name and argv[0]
to "(foobar)" where "foobar" is the basename of the path we are about to
execute.

This should be useful when charting boot progress.

											
										
										
											2012-02-01 22:33:15 +01:00
 								/* Renames the current process to "(NAME)", where NAME is the basename of
 								 * the given path, cut down to its last 8 characters if it is longer.
 								 * An empty basename results in "(...)" instead. The final string is
 								 * handed to rename_process(). */
+								static void rename_process_from_path(const char *path) {
 								        /* Room for "(" + at most 8 name characters + ")" + NUL terminator */
 								        char process_name[11];
 								        const char *p;
 								        size_t l;
 								        /* This resulting string must fit in 10 chars (i.e. the length
 								         * of "/sbin/init") to look pretty in /bin/ps */
-												Get rid of our reimplementation of basename

The only problem is that libgen.h #defines basename to point to it's
own broken implementation instead of the GNU one. This can be fixed
by #undefining basename.

											
										
										
											2013-12-07 03:29:55 +01:00
+								        p = basename(path);
-												exec: include path name of binary we are about to execute when renaming forked off processes

Immediately after forking off a process change the comm name and argv[0]
to "(foobar)" where "foobar" is the basename of the path we are about to
execute.

This should be useful when charting boot progress.

											
										
										
											2012-02-01 22:33:15 +01:00
+								        if (isempty(p)) {
 								                rename_process("(...)");
 								                return;
 								        }
 								        l = strlen(p);
 								        if (l > 8) {
 								                /* The end of the process name is usually more
 								                 * interesting, since the first bit might just be
 								                 * "systemd-" */
 								                p = p + l - 8;
 								                l = 8;
 								        }
 								        process_name[0] = '(';
 								        memcpy(process_name+1, p, l);
 								        process_name[1+l] = ')';
 								        process_name[1+l+1] = 0;
 								        rename_process(process_name);
 								}
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								/* Returns true when the context carries an address family setting, i.e.
 								 * the whitelist flag is set or the address_families set is non-empty. */
 								static bool context_has_address_families(const ExecContext *c) {
 								        assert(c);

 								        if (c->address_families_whitelist)
 								                return true;

 								        return !set_isempty(c->address_families);
 								}
 								/* Returns true when the context carries a syscall filter setting, i.e.
 								 * the whitelist flag is set or the syscall_filter set is non-empty. */
 								static bool context_has_syscall_filters(const ExecContext *c) {
 								        assert(c);

 								        /* The set is only inspected when the whitelist flag is unset */
 								        return c->syscall_whitelist ? true : !set_isempty(c->syscall_filter);
 								}
 								/* Decides whether "no new privileges" applies to this context.
 								 *
 								 * Returns true right away if c->no_new_privileges is set. Otherwise
 								 * returns false if have_effective_cap(CAP_SYS_ADMIN) reports true.
 								 * Failing both, returns true if any one of the settings checked in the
 								 * final expression below is enabled or non-empty, and false if none is. */
 								static bool context_has_no_new_privileges(const ExecContext *c) {
 								        assert(c);
 								        if (c->no_new_privileges)
 								                return true;
 								        if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
 								                return false;
 								        /* We need NNP if we have any form of seccomp and are unprivileged */
 								        return context_has_address_families(c) ||
 								                c->memory_deny_write_execute ||
 								                c->restrict_realtime ||
 								                exec_context_restrict_namespaces_set(c) ||
 								                c->protect_kernel_tunables ||
 								                c->protect_kernel_modules ||
 								                c->private_devices ||
 								                context_has_syscall_filters(c) ||
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
+								                !set_isempty(c->syscall_archs) ||
 								                c->lock_personality;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								}
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
-												core: simplify skip_seccomp_unavailable() a bit

Let's prefer early-exit over deep-indented if blocks. No behavioural change.

											
										
										
											2016-10-21 20:03:51 +02:00
 								        if (is_seccomp_available())
 								                return false;
 								        log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
 								        return true;
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								}
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        uint32_t negative_action, default_action, action;
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        int r;
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												syscallfilter: port to libseccomp

											
										
										
											2014-02-12 01:29:54 +01:00
+								        assert(c);
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!context_has_syscall_filters(c))
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "SystemCallFilter="))
 								                return 0;
-												seccomp: add helper call to add all secondary archs to a seccomp filter

And make use of it where appropriate for executing services and for
nspawn.

											
										
										
											2014-02-18 22:14:00 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
-												seccomp: add helper call to add all secondary archs to a seccomp filter

And make use of it where appropriate for executing services and for
nspawn.

											
										
										
											2014-02-18 22:14:00 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (c->syscall_whitelist) {
 								                default_action = negative_action;
 								                action = SCMP_ACT_ALLOW;
-												seccomp: we should control NO_NEW_PRIVS on our own, not let seccomp do this for us

											
										
										
											2014-02-25 20:32:27 +01:00
+								        } else {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                default_action = SCMP_ACT_ALLOW;
 								                action = negative_action;
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								        }
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_ambient_hack) {
 								                r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
 								                if (r < 0)
 								                        return r;
 								        }
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
+								}
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
 								        assert(u);
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
+								        assert(c);
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (set_isempty(c->syscall_archs))
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
 								                return 0;
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_restrict_archs(c->syscall_archs);
 								}
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								static int apply_address_families(const Unit* u, const ExecContext *c) {
 								        assert(u);
 								        assert(c);
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilter= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!context_has_address_families(c))
 								                return 0;
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
 								                return 0;
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
+								}
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								        assert(c);
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!c->memory_deny_write_execute)
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
 								                return 0;
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_memory_deny_write_execute();
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								}
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, as these may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								        assert(c);
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!c->restrict_realtime)
-												core: do not fail at step SECCOMP if there is no kernel support (#4004)

Fixes #3882
											
										
										
											2016-08-22 21:40:58 +02:00
+								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "RestrictRealtime="))
 								                return 0;
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, as these may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_restrict_realtime();
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, as these may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								}
-												core: make unit argument const for apply seccomp functions

											
										
										
											2016-10-27 09:39:20 +02:00
+								static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								        assert(c);
 								        /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
 								         * let's protect even those systems where this is left on in the kernel. */
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!c->protect_kernel_tunables)
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
 								                return 0;
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_protect_sysctl();
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								}
-												core: make unit argument const for apply seccomp functions

											
										
										
											2016-10-27 09:39:20 +02:00
+								static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												core:sandbox: Add ProtectKernelModules= option

This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature which is a system wide operation.

											
										
										
											2016-10-12 13:31:21 +02:00
+								        assert(c);
-												core: rework apply_protect_kernel_modules() to use seccomp_add_syscall_filter_set()

Let's simplify this call, by making use of the new infrastructure.

This is actually more in line with Djalal's original patch but instead of
searching for the filter set in the array by its name we can now use the set index and
jump directly to it.

											
										
										
											2016-10-21 20:12:33 +02:00
+								        /* Turn off module syscalls on ProtectKernelModules=yes */
-												core:sandbox: Add ProtectKernelModules= option

This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature which is a system wide operation.

											
										
										
											2016-10-12 13:31:21 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!c->protect_kernel_modules)
 								                return 0;
-												core:sandbox: Add ProtectKernelModules= option

This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature which is a system wide operation.

											
										
										
											2016-10-12 13:31:21 +02:00
+								        if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
 								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
-												core:sandbox: Add ProtectKernelModules= option

This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature which is a system wide operation.

											
										
										
											2016-10-12 13:31:21 +02:00
+								}
-												core: make unit argument const for apply seccomp functions

											
										
										
											2016-10-27 09:39:20 +02:00
+								static int apply_private_devices(const Unit *u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.

											
										
										
											2016-08-26 16:39:04 +02:00
+								        assert(c);
-												core: Use @raw-io syscall group to filter I/O syscalls when PrivateDevices= is set

Instead of having a local syscall list, use the @raw-io group which
contains the same set of syscalls to filter.

											
										
										
											2016-09-25 12:52:27 +02:00
+								        /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
-												execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.

											
										
										
											2016-08-26 16:39:04 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        if (!c->private_devices)
 								                return 0;
-												execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.

											
										
										
											2016-08-26 16:39:04 +02:00
+								        if (skip_seccomp_unavailable(u, "PrivateDevices="))
 								                return 0;
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
-												execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.

											
										
										
											2016-08-26 16:39:04 +02:00
+								}
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit, as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								        assert(u);
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit, as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								        assert(c);
 								        if (!exec_context_restrict_namespaces_set(c))
 								                return 0;
 								        if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
 								                return 0;
 								        return seccomp_restrict_namespaces(c->restrict_namespaces);
 								}
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
+								static int apply_lock_personality(const Unit* u, const ExecContext *c) {
-												seccomp: default to something resembling the current personality when locking it

Let's lock the personality to the currently set one, if nothing is
specifically specified. But do so with a grain of salt, and never
default to any exotic personality here, but only PER_LINUX or
PER_LINUX32.

											
										
										
											2017-08-09 20:40:26 +02:00
+								        unsigned long personality;
 								        int r;
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
 								        assert(u);
 								        assert(c);
 								        if (!c->lock_personality)
 								                return 0;
 								        if (skip_seccomp_unavailable(u, "LockPersonality="))
 								                return 0;
-												seccomp: default to something resembling the current personality when locking it

Let's lock the personality to the currently set one, if nothing is
specifically specified. But do so with a grain of salt, and never
default to any exotic personality here, but only PER_LINUX or
PER_LINUX32.

											
										
										
											2017-08-09 20:40:26 +02:00
+								        personality = c->personality;
 								        /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
 								        if (personality == PERSONALITY_INVALID) {
 								                r = opinionated_personality(&personality);
 								                if (r < 0)
 								                        return r;
 								        }
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
 								        return seccomp_lock_personality(personality);
 								}
-												syscallfilter: port to libseccomp

											
										
										
											2014-02-12 01:29:54 +01:00
+								#endif
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
+								static void do_idle_pipe_dance(int idle_pipe[4]) {
 								        assert(idle_pipe);
-												execute: invalidate idle pipe after use

Not strictly necessary, but makes clear the fds are invalidated. Make
sure we do the same here as in most other cases.

											
										
										
											2015-09-11 18:14:11 +02:00
+								        idle_pipe[1] = safe_close(idle_pipe[1]);
 								        idle_pipe[2] = safe_close(idle_pipe[2]);
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
 								        if (idle_pipe[0] >= 0) {
 								                int r;
 								                r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
 								                if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
-												execute: fix return type from write()

											
										
										
											2015-09-11 18:15:08 +02:00
+								                        ssize_t n;
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
+								                        /* Signal systemd that we are bored and want to continue. */
-												execute: fix return type from write()

											
										
										
											2015-09-11 18:15:08 +02:00
+								                        n = write(idle_pipe[3], "x", 1);
 								                        if (n > 0)
-												core: don't wait for reply if writing to pipe fails

This shouldn't really happen, but it seems cleaner to
continue on error.

CID #1237552.

											
										
										
											2015-03-14 03:20:53 +01:00
+								                                /* Wait for systemd to react to the signal above. */
 								                                fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
+								                }
-												execute: invalidate idle pipe after use

Not strictly necessary, but makes clear the fds are invalidated. Make
sure we do the same here as in most other cases.

											
										
										
											2015-09-11 18:14:11 +02:00
+								                idle_pipe[0] = safe_close(idle_pipe[0]);
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
 								        }
-												execute: invalidate idle pipe after use

Not strictly necessary, but makes clear the fds are invalidated. Make
sure we do the same here as in most other cases.

											
										
										
											2015-09-11 18:14:11 +02:00
+								        idle_pipe[3] = safe_close(idle_pipe[3]);
-												systemd: do not output status messages once gettys are running

Make Type=idle communication bidirectional: when bootup is finished,
the manager, as before, signals idling Type=idle jobs to continue.
However, if the boot takes too long, idling jobs signal the manager
that they have had enough, wait a tiny bit more, and continue, taking
ownership of the console. The manager, when signalled that Type=idle
jobs are done, makes a note and will not write to the console anymore.

This is a cosmetic issue, but quite noticeable, so let's just fix it.

Based on Harald Hoyer's patch.

https://bugs.freedesktop.org/show_bug.cgi?id=54247
http://unix.stackexchange.com/questions/51805/systemd-messages-after-starting-login/

											
										
										
											2013-07-16 03:34:57 +02:00
+								}
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								static int build_environment(
-												core: bypass dynamic user lookups from dbus-daemon

dbus-daemon does NSS name look-ups in order to enforce its bus policy. This
might dead-lock if an NSS module wants to use D-Bus for the look-up itself,
like our nss-systemd does. Let's work around this by bypassing bus
communication in the NSS module if we run inside of dbus-daemon. To make this
work we keep a bit of extra state in /run/systemd/dynamic-uid/ so that we don't
have to consult the bus, but can still resolve the names.

Note that the normal codepath continues to be via the bus, so that resolving
works from all mount namespaces and is subject to authentication, as before.

This is a bit dirty, but not too dirty, as dbus daemon is kinda special anyway
for PID 1.

											
										
										
											2016-08-02 12:28:51 +02:00
+								                Unit *u,
-												exec: factor out most function arguments of exec_spawn() to ExecParameters

This way, the list of arguments to that function gets more comprehensive,
and we can get around passing lots of NULL and 0 arguments from socket.c,
swap.c and mount.c.

It also allows for splitting up the code in exec_spawn().

While at it, make ExecContext const in execute.c.

											
										
										
											2014-08-23 15:28:37 +02:00
+								                const ExecContext *c,
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                const ExecParameters *p,
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								                unsigned n_fds,
 								                const char *home,
 								                const char *username,
 								                const char *shell,
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								                dev_t journal_stream_dev,
 								                ino_t journal_stream_ino,
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								                char ***ret) {
 								        _cleanup_strv_free_ char **our_env = NULL;
 								        unsigned n_env = 0;
 								        char *x;
-												core: add "invocation ID" concept to service manager

This adds a new invocation ID concept to the service manager. The invocation ID
identifies each runtime cycle of a unit uniquely. A new randomized 128bit ID is
generated each time a unit moves from an inactive to an activating or active
state.

The primary usecase for this concept is to connect the runtime data PID 1
maintains about a service with the offline data the journal stores about it.
Previously we'd use the unit name plus start/stop times, which however is
highly racy since the journal will generally process log data after the service
already ended.

The "invocation ID" kinda matches the "boot ID" concept of the Linux kernel,
except that it applies to an individual unit instead of the whole system.

The invocation ID is passed to the activated processes as environment variable.
It is additionally stored as extended attribute on the cgroup of the unit. The
latter is used by journald to automatically retrieve it for each logged
message and attach it to the log entry. The environment variable is very easily
accessible, even for unprivileged services. OTOH the extended attribute is only
accessible to privileged processes (this is because cgroupfs only supports the
"trusted." xattr namespace, not "user."). The environment variable may be
altered by services, the extended attribute may not be, hence is the better
choice for the journal.

Note that reading the invocation ID off the extended attribute from journald is
racy, similar to the way reading the unit name for a logging process is.

This patch adds APIs to read the invocation ID to sd-id128:
sd_id128_get_invocation() may be used in a similar fashion to
sd_id128_get_boot().

PID1's own logging is updated to always include the invocation ID when it logs
information about a unit.

A new bus call GetUnitByInvocationID() is added that allows retrieving a bus
path to a unit by its invocation ID. The bus path is built using the invocation
ID, thus providing a path for referring to a unit that is valid only for the
current runtime cycle of it.

Outlook for the future: should the kernel eventually allow passing of cgroup
information along AF_UNIX/SOCK_DGRAM messages via a unique cgroup id, then we
can alter the invocation ID to be generated as hash from that rather than
entirely randomly. This way we can derive the invocation race-freely from the
messages.

											
										
										
											2016-08-30 23:18:46 +02:00
+								        assert(u);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        assert(c);
 								        assert(ret);
-												core: add "invocation ID" concept to service manager

This adds a new invocation ID concept to the service manager. The invocation ID
identifies each runtime cycle of a unit uniquely. A new randomized 128bit ID is
generated each time a unit moves from an inactive to an activating or active
state.

The primary usecase for this concept is to connect the runtime data PID 1
maintains about a service with the offline data the journal stores about it.
Previously we'd use the unit name plus start/stop times, which however is
highly racy since the journal will generally process log data after the service
already ended.

The "invocation ID" kinda matches the "boot ID" concept of the Linux kernel,
except that it applies to an individual unit instead of the whole system.

The invocation ID is passed to the activated processes as environment variable.
It is additionally stored as extended attribute on the cgroup of the unit. The
latter is used by journald to automatically retrieve it for each logged
message and attach it to the log entry. The environment variable is very easily
accessible, even for unprivileged services. OTOH the extended attribute is only
accessible to privileged processes (this is because cgroupfs only supports the
"trusted." xattr namespace, not "user."). The environment variable may be
altered by services, the extended attribute may not be, hence is the better
choice for the journal.

Note that reading the invocation ID off the extended attribute from journald is
racy, similar to the way reading the unit name for a logging process is.

This patch adds APIs to read the invocation ID to sd-id128:
sd_id128_get_invocation() may be used in a similar fashion to
sd_id128_get_boot().

PID1's own logging is updated to always include the invocation ID when it logs
information about a unit.

A new bus call GetUnitByInvocationID() is added that allows retrieving a bus
path to a unit by its invocation ID. The bus path is built using the invocation
ID, thus providing a path for referring to a unit that is valid only for the
current runtime cycle of it.

Outlook for the future: should the kernel eventually allow passing of cgroup
information along AF_UNIX/SOCK_DGRAM messages via a unique cgroup id, then we
can alter the invocation ID to be generated as hash from that rather than
entirely randomly. This way we can derive the invocation race-freely from the
messages.

											
										
										
											2016-08-30 23:18:46 +02:00
+								        our_env = new0(char*, 14);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        if (!our_env)
 								                return -ENOMEM;
 								        if (n_fds > 0) {
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								                _cleanup_free_ char *joined = NULL;
-												tree-wide: make use of getpid_cached() wherever we can

This moves pretty much all uses of getpid() over to getpid_cached(). I
didn't specifically check whether the optimization is worth it for each
replacement, but in order to keep things simple and systematic I
switched over everything at once.

											
										
										
											2017-07-20 16:19:18 +02:00
+								                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								                if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                joined = strv_join(p->fd_names, ":");
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								                if (!joined)
 								                        return -ENOMEM;
-												tree-wide: drop NULL sentinel from strjoin

This makes strjoin and strjoina more similar and avoids the useless final
argument.

spatch -I . -I ./src -I ./src/basic -I ./src/basic -I ./src/shared -I ./src/shared -I ./src/network -I ./src/locale -I ./src/login -I ./src/journal -I ./src/journal -I ./src/timedate -I ./src/timesync -I ./src/nspawn -I ./src/resolve -I ./src/resolve -I ./src/systemd -I ./src/core -I ./src/core -I ./src/libudev -I ./src/udev -I ./src/udev/net -I ./src/udev -I ./src/libsystemd/sd-bus -I ./src/libsystemd/sd-event -I ./src/libsystemd/sd-login -I ./src/libsystemd/sd-netlink -I ./src/libsystemd/sd-network -I ./src/libsystemd/sd-hwdb -I ./src/libsystemd/sd-device -I ./src/libsystemd/sd-id128 -I ./src/libsystemd-network --sp-file coccinelle/strjoin.cocci --in-place $(git ls-files src/*.c)

git grep -e '\bstrjoin\b.*NULL' -l|xargs sed -i -r 's/strjoin\((.*), NULL\)/strjoin(\1)/'

This might have missed a few cases (spatch has a really hard time dealing
with _cleanup_ macros), but that's no big issue, they can always be fixed
later.

											
										
										
											2016-10-23 17:43:27 +02:00
+								                x = strjoin("LISTEN_FDNAMES=", joined);
-												core: add support for naming file descriptors passed using socket activation

This adds support for naming file descriptors passed using socket
activation. The names are passed in a new $LISTEN_FDNAMES= environment
variable, that matches the existing $LISTEN_FDS= one and contains a
colon-separated list of names.

This also adds support for naming fds submitted to the per-service fd
store using FDNAME= in the sd_notify() message.

This also adds a new FileDescriptorName= setting for socket unit files
to set the name for fds created by socket units.

This also adds a new call sd_listen_fds_with_names(), that is similar to
sd_listen_fds(), but also returns the names of the fds.

systemd-activate gained the new --fdname= switch to specify a name for
testing socket activation.

This is based on #1247 by Maciej Wereski.

Fixes #1247.

											
										
										
											2015-10-04 17:36:19 +02:00
+								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        }
-												core: only set the watchdog variables in ExecStart= lines

											
										
										
											2016-08-04 22:11:29 +02:00
+								        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
-												tree-wide: make use of getpid_cached() wherever we can

This moves pretty much all uses of getpid() over to getpid_cached(). I
didn't specifically check whether the optimization is worth it for each
replacement, but in order to keep things simple and systematic I
switched over everything at once.

											
										
										
											2017-07-20 16:19:18 +02:00
+								                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
-												sd-daemon: introduce sd_watchdog_enabled() for parsing $WATCHDOG_USEC

Also, introduce a new environment variable named $WATCHDOG_PID which
contains the PID of the process that is supposed to send the keep-alive
events. This is similar to how $LISTEN_FDS and $LISTEN_PID work together,
and protects against confusing processes further down the process tree
due to inherited environment.

											
										
										
											2013-12-22 22:14:05 +01:00
+								                        return -ENOMEM;
 								                our_env[n_env++] = x;
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
-												sd-daemon: introduce sd_watchdog_enabled() for parsing $WATCHDOG_USEC

Also, introduce a new environment variable named $WATCHDOG_PID which
contains the PID of the process that is supposed to send the keep-alive
events. This is similar to how $LISTEN_FDS and $LISTEN_PID work together,
and protects against confusing processes further down the process tree
due to inherited environment.

											
										
										
											2013-12-22 22:14:05 +01:00
+								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												core: bypass dynamic user lookups from dbus-daemon

dbus-daemon does NSS name look-ups in order to enforce its bus policy. This
might dead-lock if an NSS module wants to use D-Bus for the look-up itself,
like our nss-systemd does. Let's work around this by bypassing bus
communication in the NSS module if we run inside of dbus-daemon. To make this
work we keep a bit of extra state in /run/systemd/dynamic-uid/ so that we don't
have to consult the bus, but can still resolve the names.

Note that the normal codepath continues to be via the bus, so that resolving
works from all mount namespaces and is subject to authentication, as before.

This is a bit dirty, but not too dirty, as dbus daemon is kinda special anyway
for PID 1.

											
										
										
											2016-08-02 12:28:51 +02:00
+								        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
 								         * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
 								         * check the database directly. */
-												execute: also control the SYSTEMD_NSS_BYPASS_BUS through an ExecFlags field

Also, correct the logic while we are at it: the variable is only
required for system services, not user services.

											
										
										
											2017-08-01 10:43:04 +02:00
+								        if (p->flags & EXEC_NSS_BYPASS_BUS) {
-												core: bypass dynamic user lookups from dbus-daemon

dbus-daemon does NSS name look-ups in order to enforce its bus policy. This
might dead-lock if an NSS module wants to use D-Bus for the look-up itself,
like our nss-systemd does. Let's work around this by bypassing bus
communication in the NSS module if we run inside of dbus-daemon. To make this
work we keep a bit of extra state in /run/systemd/dynamic-uid/ so that we don't
have to consult the bus, but can still resolve the names.

Note that the normal codepath continues to be via the bus, so that resolving
works from all mount namespaces and is subject to authentication, as before.

This is a bit dirty, but not too dirty, as dbus daemon is kinda special anyway
for PID 1.

											
										
										
											2016-08-02 12:28:51 +02:00
+								                x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        if (home) {
 								                x = strappend("HOME=", home);
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
 								        if (username) {
 								                x = strappend("LOGNAME=", username);
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								                x = strappend("USER=", username);
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
 								        if (shell) {
 								                x = strappend("SHELL=", shell);
 								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												core: add "invocation ID" concept to service manager

This adds a new invocation ID concept to the service manager. The invocation ID
identifies each runtime cycle of a unit uniquely. A new randomized 128bit ID is
generated each time a unit moves from an inactive to an activating or active
state.

The primary usecase for this concept is to connect the runtime data PID 1
maintains about a service with the offline data the journal stores about it.
Previously we'd use the unit name plus start/stop times, which however is
highly racy since the journal will generally process log data after the service
already ended.

The "invocation ID" kinda matches the "boot ID" concept of the Linux kernel,
except that it applies to an individual unit instead of the whole system.

The invocation ID is passed to the activated processes as environment variable.
It is additionally stored as extended attribute on the cgroup of the unit. The
latter is used by journald to automatically retrieve it for each logged
message and attach it to the log entry. The environment variable is very easily
accessible, even for unprivileged services. OTOH the extended attribute is only
accessible to privileged processes (this is because cgroupfs only supports the
"trusted." xattr namespace, not "user."). The environment variable may be
altered by services, the extended attribute may not be, hence is the better
choice for the journal.

Note that reading the invocation ID off the extended attribute from journald is
racy, similar to the way reading the unit name for a logging process is.

This patch adds APIs to read the invocation ID to sd-id128:
sd_id128_get_invocation() may be used in a similar fashion to
sd_id128_get_boot().

PID1's own logging is updated to always include the invocation ID when it logs
information about a unit.

A new bus call GetUnitByInvocationID() is added that allows retrieving a bus
path to a unit by its invocation ID. The bus path is built using the invocation
ID, thus providing a path for referring to a unit that is valid only for the
current runtime cycle of it.

Outlook for the future: should the kernel eventually allow passing of cgroup
information along AF_UNIX/SOCK_DGRAM messages via a unique cgroup id, then we
can alter the invocation ID to be generated as hash from that rather than
entirely randomly. This way we can derive the invocation race-freely from the
messages.

											
										
										
											2016-08-30 23:18:46 +02:00
+								        if (!sd_id128_is_null(u->invocation_id)) {
 								                if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												core: inherit TERM from PID 1 for all services started on /dev/console

This way, invoking nspawn from a shell in the best case inherits the TERM
setting all the way down into the login shell spawned in the container.

Fixes: #3697

											
										
										
											2016-07-27 15:25:55 +02:00
+								        if (exec_context_needs_term(c)) {
 								                const char *tty_path, *term = NULL;
 								                tty_path = exec_context_tty_path(c);
 								                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
 								                 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
 								                 * passes to PID 1 ends up all the way in the console login shown. */
 								                if (path_equal(tty_path, "/dev/console") && getppid() == 1)
 								                        term = getenv("TERM");
 								                if (!term)
 								                        term = default_term_for_tty(tty_path);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
-												core: inherit TERM from PID 1 for all services started on /dev/console

This way, invoking nspawn from a shell in the best case inherits the TERM
setting all the way down into the login shell spawned in the container.

Fixes: #3697

											
										
										
											2016-07-27 15:25:55 +02:00
+								                x = strappend("TERM=", term);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								                if (!x)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
 								                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
 								                        return -ENOMEM;
 								                our_env[n_env++] = x;
 								        }
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
+								        our_env[n_env++] = NULL;
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								        assert(n_env <= 12);
-												execute: set TERM even if we don't open the tty on our own

This way, when a tty path is configured TERM is set, which is nice to
set a useful term for gettys.

											
										
										
											2013-12-18 17:41:16 +01:00
 								        *ret = our_env;
 								        our_env = NULL;
 								        return 0;
 								}
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--setenv=...` options to systemd-nspawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								static int build_pass_environment(const ExecContext *c, char ***ret) {
 								        _cleanup_strv_free_ char **pass_env = NULL;
 								        size_t n_env = 0, n_bufsize = 0;
 								        char **i;
 								        STRV_FOREACH(i, c->pass_environment) {
 								                _cleanup_free_ char *x = NULL;
 								                char *v;
 								                v = getenv(*i);
 								                if (!v)
 								                        continue;
-												tree-wide: drop NULL sentinel from strjoin

This makes strjoin and strjoina more similar and avoids the useless final
argument.

spatch -I . -I ./src -I ./src/basic -I ./src/basic -I ./src/shared -I ./src/shared -I ./src/network -I ./src/locale -I ./src/login -I ./src/journal -I ./src/journal -I ./src/timedate -I ./src/timesync -I ./src/nspawn -I ./src/resolve -I ./src/resolve -I ./src/systemd -I ./src/core -I ./src/core -I ./src/libudev -I ./src/udev -I ./src/udev/net -I ./src/udev -I ./src/libsystemd/sd-bus -I ./src/libsystemd/sd-event -I ./src/libsystemd/sd-login -I ./src/libsystemd/sd-netlink -I ./src/libsystemd/sd-network -I ./src/libsystemd/sd-hwdb -I ./src/libsystemd/sd-device -I ./src/libsystemd/sd-id128 -I ./src/libsystemd-network --sp-file coccinelle/strjoin.cocci --in-place $(git ls-files src/*.c)

git grep -e '\bstrjoin\b.*NULL' -l|xargs sed -i -r 's/strjoin\((.*), NULL\)/strjoin(\1)/'

This might have missed a few cases (spatch has a really hard time dealing
with _cleanup_ macros), but that's no big issue, they can always be fixed
later.

											
										
										
											2016-10-23 17:43:27 +02:00
+								                x = strjoin(*i, "=", v);
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--setenv=...` options to systemd-nspawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								                if (!x)
 								                        return -ENOMEM;
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--setenv=...` options to systemd-nspawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								                if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
 								                        return -ENOMEM;
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--setenv=...` options to systemd-nspawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								                pass_env[n_env++] = x;
 								                pass_env[n_env] = NULL;
 								                x = NULL;
 								        }
 								        *ret = pass_env;
 								        pass_env = NULL;
 								        return 0;
 								}
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								static bool exec_needs_mount_namespace(
 								                const ExecContext *context,
 								                const ExecParameters *params,
 								                ExecRuntime *runtime) {
 								        assert(context);
 								        assert(params);
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        if (context->root_image)
 								                return true;
-												doc,core: Read{Write,Only}Paths= and InaccessiblePaths=

This patch renames Read{Write,Only}Directories= and InaccessibleDirectories=
to Read{Write,Only}Paths= and InaccessiblePaths=, previous names are kept
as aliases but they are not advertised in the documentation.

Renamed variables:
`read_write_dirs` --> `read_write_paths`
`read_only_dirs` --> `read_only_paths`
`inaccessible_dirs` --> `inaccessible_paths`

											
										
										
											2016-07-07 11:17:00 +02:00
+								        if (!strv_isempty(context->read_write_paths) ||
 								            !strv_isempty(context->read_only_paths) ||
 								            !strv_isempty(context->inaccessible_paths))
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								                return true;
-												core: add ability to define arbitrary bind mounts for services

This adds two new settings BindPaths= and BindReadOnlyPaths=. They allow
defining arbitrary bind mounts specific to particular services. This is
particularly useful for services with RootDirectory= set as this permits making
specific bits of the host directory available to chrooted services.

The two new settings follow the concepts nspawn already possess in --bind= and
--bind-ro=, as well as the .nspawn settings Bind= and BindReadOnly= (and these
latter options should probably be renamed to BindPaths= and BindReadOnlyPaths=
too).

Fixes: #3439

											
										
										
											2016-11-23 22:21:40 +01:00
+								        if (context->n_bind_mounts > 0)
 								                return true;
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								        if (context->mount_flags != 0)
 								                return true;
 								        if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
 								                return true;
 								        if (context->private_devices ||
 								            context->protect_system != PROTECT_SYSTEM_NO ||
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								            context->protect_home != PROTECT_HOME_NO ||
 								            context->protect_kernel_tunables ||
-												core:sandbox: lets make /lib/modules/ inaccessible on ProtectKernelModules=

Let's go further and make /lib/modules/ inaccessible for services that do
not have business with modules. This is a minor improvement, but it may
help on setups with custom modules and they are limited... in regard to the
kernel auto-load feature.

This change introduces the NameSpaceInfo struct, which we may embed later
inside ExecContext, but for now let's just reduce the number of arguments to
setup_namespace() and merge the ProtectKernelModules feature.

											
										
										
											2016-10-12 14:11:16 +02:00
+								            context->protect_kernel_modules ||
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								            context->protect_control_groups)
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								                return true;
-												namespace: Apply MountAPIVFS= only when a Root directory is set

The MountAPIVFS= documentation says that this option has no effect
unless used in conjunction with RootDirectory= or RootImage=. Let's fix
this and avoid creating private mount namespaces where they are not
needed.

											
										
										
											2017-03-05 21:39:43 +01:00
+								        if (context->mount_apivfs && (context->root_image || context->root_directory))
-												core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjunction with RootDirectory=

This adds a boolean unit file setting MountAPIVFS=. If set, the three
main API VFS mounts will be mounted for the service. This only has an
effect on RootDirectory=, which it makes a ton times more useful.

(This is basically the /dev + /proc + /sys mounting code posted in the
original #4727, but rebased on current git, and with the automatic logic
replaced by explicit logic controlled by a unit file setting)

											
										
										
											2016-12-22 23:34:35 +01:00
+								                return true;
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								        if (context->dynamic_user &&
 								            (!strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) ||
 								             !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
 								             !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
 								             !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
 								                return true;
-												core: make exec code a bit more readable

Let's add a function that checks whether we need fs namespacing, to make
things easier to read, instead of using a humungous if expression...

											
										
										
											2015-05-13 16:34:02 +02:00
+								        return false;
 								}
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								static int setup_private_users(uid_t uid, gid_t gid) {
 								        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
 								        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
 								        _cleanup_close_ int unshare_ready_fd = -1;
 								        _cleanup_(sigkill_waitp) pid_t pid = 0;
 								        uint64_t c = 1;
 								        siginfo_t si;
 								        ssize_t n;
 								        int r;
 								        /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
 								         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
 								         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
 								         * which waits for the parent to create the new user namespace while staying in the original namespace. The
 								         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
 								         * continues execution normally. */
-												core/execute.c: check asprintf return value in the usual fashion

This is unlikely to fail, but we cannot rely on asprintf return value
on failure, so let's just be correct here.

CID #1368227.

											
										
										
											2017-01-31 17:31:47 +01:00
+								        if (uid != 0 && uid_is_valid(uid)) {
 								                r = asprintf(&uid_map,
 								                             "0 0 1\n"                      /* Map root → root */
 								                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
 								                             uid, uid);
 								                if (r < 0)
 								                        return -ENOMEM;
 								        } else {
-												core: move misplaced comment to the right place

											
										
										
											2016-10-21 20:05:49 +02:00
+								                uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
-												core/execute.c: check asprintf return value in the usual fashion

This is unlikely to fail, but we cannot rely on asprintf return value
on failure, so let's just be correct here.

CID #1368227.

											
										
										
											2017-01-31 17:31:47 +01:00
+								                if (!uid_map)
 								                        return -ENOMEM;
 								        }
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
-												core/execute.c: check asprintf return value in the usual fashion

This is unlikely to fail, but we cannot rely on asprintf return value
on failure, so let's just be correct here.

CID #1368227.

											
										
										
											2017-01-31 17:31:47 +01:00
+								        if (gid != 0 && gid_is_valid(gid)) {
 								                r = asprintf(&gid_map,
 								                             "0 0 1\n"                      /* Map root → root */
 								                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
 								                             gid, gid);
 								                if (r < 0)
 								                        return -ENOMEM;
 								        } else {
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								                gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
-												core/execute.c: check asprintf return value in the usual fashion

This is unlikely to fail, but we cannot rely on asprintf return value
on failure, so let's just be correct here.

CID #1368227.

											
										
										
											2017-01-31 17:31:47 +01:00
+								                if (!gid_map)
 								                        return -ENOMEM;
 								        }
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
 								        /* Create a communication channel so that the parent can tell the child when it finished creating the user
 								         * namespace. */
 								        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
 								        if (unshare_ready_fd < 0)
 								                return -errno;
 								        /* Create a communication channel so that the child can tell the parent a proper error code in case it
 								         * failed. */
 								        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
 								                return -errno;
 								        pid = fork();
 								        if (pid < 0)
 								                return -errno;
 								        if (pid == 0) {
 								                _cleanup_close_ int fd = -1;
 								                const char *a;
 								                pid_t ppid;
 								                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
 								                 * here, after the parent opened its own user namespace. */
 								                ppid = getppid();
 								                errno_pipe[0] = safe_close(errno_pipe[0]);
 								                /* Wait until the parent unshared the user namespace */
 								                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
 								                        r = -errno;
 								                        goto child_fail;
 								                }
 								                /* Disable the setgroups() system call in the child user namespace, for good. */
 								                a = procfs_file_alloca(ppid, "setgroups");
 								                fd = open(a, O_WRONLY|O_CLOEXEC);
 								                if (fd < 0) {
 								                        if (errno != ENOENT) {
 								                                r = -errno;
 								                                goto child_fail;
 								                        }
 								                        /* If the file is missing the kernel is too old, let's continue anyway. */
 								                } else {
 								                        if (write(fd, "deny\n", 5) < 0) {
 								                                r = -errno;
 								                                goto child_fail;
 								                        }
 								                        fd = safe_close(fd);
 								                }
 								                /* First write the GID map */
 								                a = procfs_file_alloca(ppid, "gid_map");
 								                fd = open(a, O_WRONLY|O_CLOEXEC);
 								                if (fd < 0) {
 								                        r = -errno;
 								                        goto child_fail;
 								                }
 								                if (write(fd, gid_map, strlen(gid_map)) < 0) {
 								                        r = -errno;
 								                        goto child_fail;
 								                }
 								                fd = safe_close(fd);
 								                /* Then write the UID map */
 								                a = procfs_file_alloca(ppid, "uid_map");
 								                fd = open(a, O_WRONLY|O_CLOEXEC);
 								                if (fd < 0) {
 								                        r = -errno;
 								                        goto child_fail;
 								                }
 								                if (write(fd, uid_map, strlen(uid_map)) < 0) {
 								                        r = -errno;
 								                        goto child_fail;
 								                }
 								                _exit(EXIT_SUCCESS);
 								        child_fail:
 								                (void) write(errno_pipe[1], &r, sizeof(r));
 								                _exit(EXIT_FAILURE);
 								        }
 								        errno_pipe[1] = safe_close(errno_pipe[1]);
 								        if (unshare(CLONE_NEWUSER) < 0)
 								                return -errno;
 								        /* Let the child know that the namespace is ready now */
 								        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
 								                return -errno;
 								        /* Try to read an error code from the child */
 								        n = read(errno_pipe[0], &r, sizeof(r));
 								        if (n < 0)
 								                return -errno;
 								        if (n == sizeof(r)) { /* an error code was sent to us */
 								                if (r < 0)
 								                        return r;
 								                return -EIO;
 								        }
 								        if (n != 0) /* on success we should have read 0 bytes */
 								                return -EIO;
 								        r = wait_for_terminate(pid, &si);
 								        if (r < 0)
 								                return r;
 								        pid = 0;
 								        /* If something strange happened with the child, let's consider this fatal, too */
 								        if (si.si_code != CLD_EXITED || si.si_status != 0)
 								                return -EIO;
 								        return 0;
 								}
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								static int setup_exec_directory(
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
+								                const ExecContext *context,
 								                const ExecParameters *params,
 								                uid_t uid,
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                gid_t gid,
 								                ExecDirectoryType type,
 								                int *exit_status) {
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
 								                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
 								                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
 								                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
 								                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
 								        };
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
+								        char **rt;
 								        int r;
 								        assert(context);
 								        assert(params);
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        assert(exit_status);
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        if (!params->prefix[type])
 								                return 0;
-												execute: add one more ExecFlags flag, for controlling unconditional directory chowning

Let's decouple the Manager object from the execution logic a bit more
here too, and simply pass along the fact whether we should
unconditionally chown the runtime/... directories via the ExecFlags
field too.

											
										
										
											2017-08-01 10:35:10 +02:00
+								        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                if (!uid_is_valid(uid))
 								                        uid = 0;
 								                if (!gid_is_valid(gid))
 								                        gid = 0;
 								        }
 								        STRV_FOREACH(rt, context->directories[type].paths) {
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                _cleanup_free_ char *p = NULL, *pp = NULL;
 								                const char *effective;
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                p = strjoin(params->prefix[type], "/", *rt);
 								                if (!p) {
 								                        r = -ENOMEM;
 								                        goto fail;
 								                }
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
-												core: support subdirectories in RuntimeDirectory= option

											
										
										
											2017-07-17 09:30:53 +02:00
+								                r = mkdir_parents_label(p, 0755);
 								                if (r < 0)
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                        goto fail;
-												core: support subdirectories in RuntimeDirectory= option

											
										
										
											2017-07-17 09:30:53 +02:00
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similar to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                if (context->dynamic_user && type != EXEC_DIRECTORY_CONFIGURATION) {
 								                        _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
 								                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
 								                         * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
 								                         * whose UID is later on reused. To lock this down we use the same trick used by container
 								                         * managers to prohibit host users to get access to files of the same UID in containers: we
 								                         * place everything inside a directory that has an access mode of 0700 and is owned root:root,
 								                         * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
 								                         * to make this directory permeable for the service itself.
 								                         *
 								                         * Specifically: for a service which wants a special directory "foo/" we first create a
 								                         * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
 								                         * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
 								                         * privileged host users can access "foo/" as usual, but unprivileged host users can't look
 								                         * into it. Inside of the namespace of the container "private/" is replaced by a more liberally
 								                         * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
 								                         * disabling the access boundary for the service and making sure it only gets access to the
 								                         * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
 								                         *
 								                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
 								                         * owned by the service itself. */
 								                        private_root = strjoin(params->prefix[type], "/private");
 								                        if (!private_root) {
 								                                r = -ENOMEM;
 								                                goto fail;
 								                        }
 								                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
-												mkdir: introduce follow_symlink flag to mkdir_safe{,_label}()

											
										
										
											2017-10-06 09:03:33 +02:00
+								                        r = mkdir_safe_label(private_root, 0700, 0, 0, false);
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similar to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                        if (r < 0)
 								                                goto fail;
 								                        pp = strjoin(private_root, "/", *rt);
 								                        if (!pp) {
 								                                r = -ENOMEM;
 								                                goto fail;
 								                        }
 								                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
 								                        r = mkdir_parents_label(pp, 0755);
 								                        if (r < 0)
 								                                goto fail;
 								                        /* Finally, create the actual directory for the service */
 								                        r = mkdir_label(pp, context->directories[type].mode);
 								                        if (r < 0 && r != -EEXIST)
 								                                goto fail;
 								                        parent = dirname_malloc(p);
 								                        if (!parent) {
 								                                r = -ENOMEM;
 								                                goto fail;
 								                        }
 								                        r = path_make_relative(parent, pp, &relative);
 								                        if (r < 0)
 								                                goto fail;
 								                        /* And link it up from the original place */
 								                        r = symlink_idempotent(relative, p);
 								                        if (r < 0)
 								                                goto fail;
 								                        effective = pp;
 								                } else {
 								                        r = mkdir_label(p, context->directories[type].mode);
 								                        if (r < 0 && r != -EEXIST)
 								                                goto fail;
 								                        effective = p;
 								                }
-												core: chown() StateDirectory= and friends recursively when starting a service

This is particularly useful when used in conjunction with DynamicUser=1,
where the UID might change for every invocation, but is useful in other
cases too, for example, when these directories are shared between
systems where the UID assignments differ slightly.

											
										
										
											2017-09-28 19:13:44 +02:00
 								                /* First lock down the access mode */
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similar to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                if (chmod(effective, context->directories[type].mode) < 0) {
-												core: chown() StateDirectory= and friends recursively when starting a service

This is particularly useful when used in conjunction with DynamicUser=1,
where the UID might change for every invocation, but is useful in other
cases too, for example, when these directories are shared between
systems where the UID assignments differ slightly.

											
										
										
											2017-09-28 19:13:44 +02:00
+								                        r = -errno;
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                        goto fail;
-												core: chown() StateDirectory= and friends recursively when starting a service

This is particularly useful when used in conjunction with DynamicUser=1,
where the UID might change for every invocation, but is useful in other
cases too, for example, when these directories are shared between
systems where the UID assignments differ slightly.

											
										
										
											2017-09-28 19:13:44 +02:00
+								                }
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
-												core: don't chown() the configuration directory

The configuration directory is commonly not owned by a service, but
remains root-owned, hence don't change the owner automatically for it.

											
										
										
											2017-08-01 10:36:33 +02:00
+								                /* Don't change the owner of the configuration directory, as in the common case it is not written to by
 								                 * a service, and shall not be writable. */
 								                if (type == EXEC_DIRECTORY_CONFIGURATION)
 								                        continue;
-												core: chown() StateDirectory= and friends recursively when starting a service

This is particularly useful when used in conjunction with DynamicUser=1,
where the UID might change for every invocation, but is useful in other
cases too, for example, when these directories are shared between
systems where the UID assignments differ slightly.

											
										
										
											2017-09-28 19:13:44 +02:00
+								                /* Then, change the ownership of the whole tree, if necessary */
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similar to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                r = path_chown_recursive(effective, uid, gid);
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
+								                if (r < 0)
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                        goto fail;
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
+								        }
 								        return 0;
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
 								fail:
 								        *exit_status = exit_status_table[type];
 								        return r;
-												execute: split out creation of runtime dirs into its own functions

											
										
										
											2016-08-25 10:12:57 +02:00
+								}
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
+								static int setup_smack(
 								                const ExecContext *context,
 								                const ExecCommand *command) {
 								        /* Applies a SMACK process label through mac_smack_apply_pid(), which is invoked with PID 0
 								         * (presumably meaning the calling process -- TODO confirm against the SMACK helper).
 								         *
 								         * An explicitly configured context->smack_process_label takes precedence. Otherwise, and only
 								         * if SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from
 								         * command->path is applied, falling back to SMACK_DEFAULT_PROCESS_LABEL when no label was
 								         * obtained. If neither case applies nothing is changed.
 								         *
 								         * Returns 0 on success, or the negative error code of the helper call that failed. */
 								        int r;
 								        assert(context);
 								        assert(command);
 								        if (context->smack_process_label) {
 								                /* A label was configured explicitly: apply it as-is. */
 								                r = mac_smack_apply_pid(0, context->smack_process_label);
 								                if (r < 0)
 								                        return r;
 								        }
 								#ifdef SMACK_DEFAULT_PROCESS_LABEL
 								        else {
 								                _cleanup_free_ char *exec_label = NULL;
 								                /* No configured label: look for an exec label on the binary that is about to be run. */
 								                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
-												tree-wide: use IN_SET macro (#6977)


											
										
										
											2017-10-04 16:01:32 +02:00
+								                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
-												execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.

											
										
										
											2016-08-26 17:40:42 +02:00
+								                        return r;
 								                /* -ENODATA and -EOPNOTSUPP are tolerated above; use the label that was read if there is
 								                 * one, otherwise the compile-time default. */
 								                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
 								                if (r < 0)
 								                        return r;
 								        }
 								#endif
 								        return 0;
 								}
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similar to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								static int compile_bind_mounts(
 								                const ExecContext *context,
 								                const ExecParameters *params,
 								                BindMount **ret_bind_mounts,
 								                unsigned *ret_n_bind_mounts,
 								                char ***ret_empty_directories) {
 								        _cleanup_strv_free_ char **empty_directories = NULL;
 								        BindMount *bind_mounts;
 								        unsigned n, h = 0, i;
 								        ExecDirectoryType t;
 								        int r;
 								        assert(context);
 								        assert(params);
 								        assert(ret_bind_mounts);
 								        assert(ret_n_bind_mounts);
 								        assert(ret_empty_directories);
 								        n = context->n_bind_mounts;
 								        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
 								                if (!params->prefix[t])
 								                        continue;
 								                n += strv_length(context->directories[t].paths);
 								        }
 								        if (n <= 0) {
 								                *ret_bind_mounts = NULL;
 								                *ret_n_bind_mounts = 0;
 								                *ret_empty_directories = NULL;
 								                return 0;
 								        }
 								        bind_mounts = new(BindMount, n);
 								        if (!bind_mounts)
 								                return -ENOMEM;
-												core: fix segfault in compile_bind_mounts() when BindPaths= or BindReadOnlyPaths= is set

This fixes a bug introduced by 6c47cd7d3bf35c8158a0737f34fe2c5dc95e72d6.

Fixes #7055.

											
										
										
											2017-10-11 05:27:13 +02:00
+								        for (i = 0; i < context->n_bind_mounts; i++) {
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                BindMount *item = context->bind_mounts + i;
 								                char *s, *d;
 								                s = strdup(item->source);
 								                if (!s) {
 								                        r = -ENOMEM;
 								                        goto finish;
 								                }
 								                d = strdup(item->destination);
 								                if (!d) {
 								                        free(s);
 								                        r = -ENOMEM;
 								                        goto finish;
 								                }
 								                bind_mounts[h++] = (BindMount) {
 								                        .source = s,
 								                        .destination = d,
 								                        .read_only = item->read_only,
 								                        .recursive = item->recursive,
 								                        .ignore_enoent = item->ignore_enoent,
 								                };
 								        }
 								        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
 								                char **suffix;
 								                if (!params->prefix[t])
 								                        continue;
 								                if (strv_isempty(context->directories[t].paths))
 								                        continue;
 								                if (context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION) {
 								                        char *private_root;
 								                        /* So this is for a dynamic user, and we need to make sure the process can access its own
 								                         * directory. For that we overmount the usually inaccessible "private" subdirectory with a
 								                         * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
 								                        private_root = strjoin(params->prefix[t], "/private");
 								                        if (!private_root) {
 								                                r = -ENOMEM;
 								                                goto finish;
 								                        }
 								                        r = strv_consume(&empty_directories, private_root);
 								                        if (r < 0) {
 								                                r = -ENOMEM;
 								                                goto finish;
 								                        }
 								                }
 								                STRV_FOREACH(suffix, context->directories[t].paths) {
 								                        char *s, *d;
 								                        if (context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION)
 								                                s = strjoin(params->prefix[t], "/private/", *suffix);
 								                        else
 								                                s = strjoin(params->prefix[t], "/", *suffix);
 								                        if (!s) {
 								                                r = -ENOMEM;
 								                                goto finish;
 								                        }
 								                        d = strdup(s);
 								                        if (!d) {
 								                                free(s);
 								                                r = -ENOMEM;
 								                                goto finish;
 								                        }
 								                        bind_mounts[h++] = (BindMount) {
 								                                .source = s,
 								                                .destination = d,
 								                                .read_only = false,
 								                                .recursive = true,
 								                                .ignore_enoent = false,
 								                        };
 								                }
 								        }
 								        assert(h == n);
 								        *ret_bind_mounts = bind_mounts;
 								        *ret_n_bind_mounts = n;
 								        *ret_empty_directories = empty_directories;
 								        empty_directories = NULL;
 								        return (int) n;
 								finish:
 								        bind_mount_free_many(bind_mounts, h);
 								        return r;
 								}
-												core: skip ReadOnlyPaths= and other permission-related mounts on PermissionsStartOnly= (#5309)

ReadOnlyPaths=, ProtectHome=, InaccessiblePaths= and ProtectSystem= are
about restricting access and little more, hence they should be disabled
if PermissionsStartOnly= is used or ExecStart= lines are prefixed with a
"+". Do that.

(Note that we will still create namespaces and stuff, since that's about
a lot more than just permissions. We'll simply disable the effect of
the four options mentioned above, but nothing else mount related.)

This also adds a test for this, to ensure this works as intended.

No documentation updates, as the documentation is already vague enough
to support the new behaviour ("If true, the permission-related execution
options…"). We could clarify this further, but I think we might want to
extend the switches' behaviour a bit more in future, hence leave it at
this for now.

Fixes: #5308
											
										
										
											2017-02-12 06:44:46 +01:00
+								static int apply_mount_namespace(
 								                Unit *u,
 								                ExecCommand *command,
 								                const ExecContext *context,
 								                const ExecParameters *params,
 								                ExecRuntime *runtime) {
-												core: remove compile_read_write_paths()

From 6c47cd7d3bf35c8158a0737f34fe2c5dc95e72d6, RuntimeDirectory= and
their friends also imply BindPaths=. Thus, implying ReadWritePaths=
is meaningless.

											
										
										
											2017-10-13 14:13:25 +02:00
+								        _cleanup_strv_free_ char **empty_directories = NULL;
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								        char *tmp = NULL, *var = NULL;
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        const char *root_dir = NULL, *root_image = NULL;
-												namespace: change NameSpace → Namespace

We generally use the casing "Namespace" for the word, and that's visible
in a number of user-facing interfaces, including "RestrictNamespace=" or
"JoinsNamespaceOf=". Let's make sure to use the same casing internally
too.

As discussed in #7024

											
										
										
											2017-10-10 09:49:20 +02:00
+								        NamespaceInfo ns_info = {
-												core: on DynamicUser= make sure that protecting sensitive paths is enforced (#4596)

This adds a variable that is always set to false to make sure that
protect paths inside sandbox are always enforced and not ignored. The only
case when it is set to true is on DynamicUser=no and RootDirectory=/chroot
is set. This allows users to use more of our sandbox features inside RootDirectory=

The only exception is ProtectSystem=full|strict and when DynamicUser=yes
is implied. Currently RootDirectory= is not fully compatible with these
due to two reasons:

* /chroot/usr|etc has to be present on ProtectSystem=full
* /chroot// has to be a mount point on ProtectSystem=strict.

											
										
										
											2016-11-06 23:31:55 +01:00
+								                .ignore_protect_paths = false,
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								                .private_dev = context->private_devices,
 								                .protect_control_groups = context->protect_control_groups,
 								                .protect_kernel_tunables = context->protect_kernel_tunables,
 								                .protect_kernel_modules = context->protect_kernel_modules,
-												core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjunction with RootDirectory=

This adds a boolean unit file setting MountAPIVFS=. If set, the three
main API VFS mounts will be mounted for the service. This only has an
effect on RootDirectory=, which it makes a ton times more useful.

(This is basically the /dev + /proc + /sys mounting code posted in the
original #4727, but rebased on current git, and with the automatic logic
replaced by explicit logic controlled by a unit file setting)

											
										
										
											2016-12-22 23:34:35 +01:00
+								                .mount_apivfs = context->mount_apivfs,
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								        };
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        bool needs_sandboxing;
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								        BindMount *bind_mounts = NULL;
 								        unsigned n_bind_mounts = 0;
-												core: skip ReadOnlyPaths= and other permission-related mounts on PermissionsStartOnly= (#5309)

ReadOnlyPaths=, ProtectHome=, InaccessiblePaths= and ProtectSystem= are
about restricting access and little more, hence they should be disabled
if PermissionsStartOnly= is used or ExecStart= lines are prefixed with a
"+". Do that.

(Note that we will still create namespaces and stuff, since that's about
a lot more than just permissions. We'll simply disable the effect of
the four options mentioned above, but nothing else mount related.)

This also adds a test for this, to ensure this works as intended.

No documentation updates, as the documentation is already vague enough
to support the new behaviour ("If true, the permission-related execution
options…"). We could clarify this further, but I think we might want to
extend the switches' behaviour a bit more in future, hence leave it at
this for now.

Fixes: #5308
											
										
										
											2017-02-12 06:44:46 +01:00
+								        int r;
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and let's get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								        assert(context);
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								        /* The runtime struct only contains the parent of the private /tmp,
 								         * which is non-accessible to world users. Inside of it there's a /tmp
 								         * that is sticky, and that's the one we want to use here. */
 								        if (context->private_tmp && runtime) {
 								                if (runtime->tmp_dir)
 								                        tmp = strjoina(runtime->tmp_dir, "/tmp");
 								                if (runtime->var_tmp_dir)
 								                        var = strjoina(runtime->var_tmp_dir, "/tmp");
 								        }
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        if (params->flags & EXEC_APPLY_CHROOT) {
 								                root_image = context->root_image;
 								                if (!root_image)
 								                        root_dir = context->root_directory;
 								        }
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								        r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
 								        if (r < 0)
 								                return r;
-												core: on DynamicUser= make sure that protecting sensitive paths is enforced (#4596)

This adds a variable that is always set to false to make sure that
protect paths inside sandbox are always enforced and not ignored. The only
case when it is set to true is on DynamicUser=no and RootDirectory=/chroot
is set. This allows users to use more of our sandbox features inside RootDirectory=

The only exception is ProtectSystem=full|strict and when DynamicUser=yes
is implied. Currently RootDirectory= is not fully compatible with these
due to two reasons:

* /chroot/usr|etc has to be present on ProtectSystem=full
* /chroot// has to be a mount point on ProtectSystem=strict.

											
										
										
											2016-11-06 23:31:55 +01:00
+								        /*
 								         * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
 								         * sandbox info, otherwise enforce it, don't ignore protected paths and
 								         * fail if we are enable to apply the sandbox inside the mount namespace.
 								         */
 								        if (!context->dynamic_user && root_dir)
 								                ns_info.ignore_protect_paths = true;
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-												core: skip ReadOnlyPaths= and other permission-related mounts on PermissionsStartOnly= (#5309)

ReadOnlyPaths=, ProtectHome=, InaccessiblePaths= and ProtectSystem= are
about restricting access and little more, hence they should be disabled
if PermissionsStartOnly= is used or ExecStart= lines are prefixed with a
"+". Do that.

(Note that we will still create namespaces and stuff, since that's about
a lot more than just permissions. We'll simply disable the effect of
the four options mentioned above, but nothing else mount related.)

This also adds a test for this, to ensure this works as intended.

No documentation updates, as the documentation is already vague enough
to support the new behaviour ("If true, the permission-related execution
options…"). We could clarify this further, but I think we might want to
extend the switches' behaviour a bit more in future, hence leave it at
this for now.

Fixes: #5308
											
										
										
											2017-02-12 06:44:46 +01:00
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        r = setup_namespace(root_dir, root_image,
-												core: remove compile_read_write_paths()

From 6c47cd7d3bf35c8158a0737f34fe2c5dc95e72d6, RuntimeDirectory= and
their friends also imply BindPaths=. Thus, implying ReadWritePaths=
is meaningless.

											
										
										
											2017-10-13 14:13:25 +02:00
+								                            &ns_info, context->read_write_paths,
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                            needs_sandboxing ? context->read_only_paths : NULL,
 								                            needs_sandboxing ? context->inaccessible_paths : NULL,
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too working similar to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                            empty_directories,
 								                            bind_mounts,
 								                            n_bind_mounts,
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								                            tmp,
 								                            var,
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                            needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
 								                            needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								                            context->mount_flags,
 								                            DISSECT_IMAGE_DISCARD_ON_LOOP);
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too working similar to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								        bind_mount_free_many(bind_mounts, n_bind_mounts);
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								        /* If we couldn't set up the namespace this is probably due to a
 								         * missing capability. In this case, silently proceed. */
 								        if (IN_SET(r, -EPERM, -EACCES)) {
 								                log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
-												execute: drop explicit log_open()/log_close() now that it is unnecessary

											
										
										
											2017-09-26 17:41:53 +02:00
+								                return 0;
-												core: move the code that setups namespaces on its own function

											
										
										
											2016-10-27 09:20:18 +02:00
+								        }
 								        return r;
 								}
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								static int apply_working_directory(
 								                const ExecContext *context,
 								                const ExecParameters *params,
 								                const char *home,
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								                const bool needs_mount_ns,
 								                int *exit_status) {
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								        const char *d, *wd;
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and let's get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
 								        assert(context);
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								        assert(exit_status);
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and let's get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								        if (context->working_directory_home) {
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								                if (!home) {
 								                        *exit_status = EXIT_CHDIR;
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								                        return -ENXIO;
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								                }
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and let's get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								                wd = home;
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
 								        } else if (context->working_directory)
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and let's get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								                wd = context->working_directory;
 								        else
 								                wd = "/";
-												core: move apply working directory code into its own apply_working_directory()

											
										
										
											2016-10-27 09:21:44 +02:00
 								        if (params->flags & EXEC_APPLY_CHROOT) {
 								                if (!needs_mount_ns && context->root_directory)
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								                        if (chroot(context->root_directory) < 0) {
 								                                *exit_status = EXIT_CHROOT;
-												core: move apply working directory code into its own apply_working_directory()

											
										
										
											2016-10-27 09:21:44 +02:00
+								                                return -errno;
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								                        }
-												core: move apply working directory code into its own apply_working_directory()

											
										
										
											2016-10-27 09:21:44 +02:00
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and let's get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								                d = wd;
 								        } else
-												execute: use prefix_roota() where appropriate

											
										
										
											2017-02-09 13:16:51 +01:00
+								                d = prefix_roota(context->root_directory, wd);
-												core: move apply working directory code into its own apply_working_directory()

											
										
										
											2016-10-27 09:21:44 +02:00
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								        if (chdir(d) < 0 && !context->working_directory_missing_ok) {
 								                *exit_status = EXIT_CHDIR;
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and let's get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								                return -errno;
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								        }
-												core: move apply working directory code into its own apply_working_directory()

											
										
										
											2016-10-27 09:21:44 +02:00
 								        return 0;
 								}
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								static int setup_keyring(
 								                Unit *u,
 								                const ExecContext *context,
 								                const ExecParameters *p,
 								                uid_t uid, gid_t gid) {
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        key_serial_t keyring;
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								        int r;
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
 								        assert(u);
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								        assert(context);
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        assert(p);
 								        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
 								         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
 								         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
 								         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
 								         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
 								         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
 								        if (!(p->flags & EXEC_NEW_KEYRING))
 								                return 0;
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
 								                return 0;
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
 								        if (keyring == -1) {
 								                if (errno == ENOSYS)
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								                else if (IN_SET(errno, EACCES, EPERM))
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								                else if (errno == EDQUOT)
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								                else
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
 								                return 0;
 								        }
-												core: store the invocation ID in the per-service keyring

Let's store the invocation ID in the per-service keyring as a root-owned key,
with strict access rights. This has the advantage over the environment-based ID
passing that it also works from SUID binaries (as the key cannot be overridden
by unprivileged code starting them), in contrast to the secure_getenv() based
mode.

The invocation ID is now passed in three different ways to a service:

- As environment variable $INVOCATION_ID. This is easy to use, but may be
  overridden by unprivileged code (which might be a bad or a good thing), which
  means it's incompatible with SUID code (see above).

- As extended attribute on the service cgroup. This cannot be overridden by
  unprivileged code, and may be queried safely from "outside" of a service.
  However, it is incompatible with containers right now, as unprivileged
  containers generally cannot set xattrs on cgroupfs.

- As "invocation_id" key in the kernel keyring. This has the benefit that the
  key cannot be changed by unprivileged service code, and thus is safe to
  access from SUID code (see above). But do note that service code can replace
  the session keyring with a fresh one that lacks the key. However in that case
  the key will not be owned by root, which is easily detectable. The keyring is
  also incompatible with containers right now, as it is not properly namespace
  aware (but this is being worked on), and thus most container managers mask
  the keyring-related system calls.

Ideally we'd only have one way to pass the invocation ID, but the different
ways all have limitations. The invocation ID hookup in journald is currently
only available on the host but not in containers, due to the mentioned
limitations.

How to verify the new invocation ID in the keyring:

 # systemd-run -t /bin/sh
 Running as unit: run-rd917366c04f847b480d486017f7239d6.service
 Press ^] three times within 1s to disconnect TTY.
 # keyctl show
 Session Keyring
  680208392 --alswrv      0     0  keyring: _ses
  250926536 ----s-rv      0     0   \_ user: invocation_id
 # keyctl request user invocation_id
 250926536
 # keyctl read 250926536
 16 bytes of data in key:
 9c96317c ac64495a a42b9cd7 4f3ff96b
 # echo $INVOCATION_ID
 9c96317cac64495aa42b9cd74f3ff96b
 # ^D

This creates a new transient service running a shell. It then verifies the
contents of the keyring, requests the invocation ID key, and reads its payload.
For comparison the invocation ID as passed via the environment variable is also
displayed.

											
										
										
											2016-12-02 15:05:55 +01:00
+								        /* Populate the keyring with the invocation ID by default. */
 								        if (!sd_id128_is_null(u->invocation_id)) {
 								                key_serial_t key;
 								                key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
 								                if (key == -1)
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
-												core: store the invocation ID in the per-service keyring

Let's store the invocation ID in the per-service keyring as a root-owned key,
with strict access rights. This has the advantage over the environment-based ID
passing that it also works from SUID binaries (as the key cannot be overridden
by unprivileged code starting them), in contrast to the secure_getenv() based
mode.

The invocation ID is now passed in three different ways to a service:

- As environment variable $INVOCATION_ID. This is easy to use, but may be
  overridden by unprivileged code (which might be a bad or a good thing), which
  means it's incompatible with SUID code (see above).

- As extended attribute on the service cgroup. This cannot be overridden by
  unprivileged code, and may be queried safely from "outside" of a service.
  However, it is incompatible with containers right now, as unprivileged
  containers generally cannot set xattrs on cgroupfs.

- As "invocation_id" key in the kernel keyring. This has the benefit that the
  key cannot be changed by unprivileged service code, and thus is safe to
  access from SUID code (see above). But do note that service code can replace
  the session keyring with a fresh one that lacks the key. However in that case
  the key will not be owned by root, which is easily detectable. The keyring is
  also incompatible with containers right now, as it is not properly namespace
  aware (but this is being worked on), and thus most container managers mask
  the keyring-related system calls.

Ideally we'd only have one way to pass the invocation ID, but the different
ways all have limitations. The invocation ID hookup in journald is currently
only available on the host but not in containers, due to the mentioned
limitations.

How to verify the new invocation ID in the keyring:

 # systemd-run -t /bin/sh
 Running as unit: run-rd917366c04f847b480d486017f7239d6.service
 Press ^] three times within 1s to disconnect TTY.
 # keyctl show
 Session Keyring
  680208392 --alswrv      0     0  keyring: _ses
  250926536 ----s-rv      0     0   \_ user: invocation_id
 # keyctl request user invocation_id
 250926536
 # keyctl read 250926536
 16 bytes of data in key:
 9c96317c ac64495a a42b9cd7 4f3ff96b
 # echo $INVOCATION_ID
 9c96317cac64495aa42b9cd74f3ff96b
 # ^D

This creates a new transient service running a shell. It then verifies the
contents of the keyring, requests the invocation ID key, and reads its payload.
For comparison the invocation ID as passed via the environment variable is also
displayed.

											
										
										
											2016-12-02 15:05:55 +01:00
+								                else {
 								                        if (keyctl(KEYCTL_SETPERM, key,
 								                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
 								                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                                return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
-												core: store the invocation ID in the per-service keyring

Let's store the invocation ID in the per-service keyring as a root-owned key,
with strict access rights. This has the advantage over the environment-based ID
passing that it also works from SUID binaries (as the key cannot be overridden
by unprivileged code starting them), in contrast to the secure_getenv() based
mode.

The invocation ID is now passed in three different ways to a service:

- As environment variable $INVOCATION_ID. This is easy to use, but may be
  overridden by unprivileged code (which might be a bad or a good thing), which
  means it's incompatible with SUID code (see above).

- As extended attribute on the service cgroup. This cannot be overridden by
  unprivileged code, and may be queried safely from "outside" of a service.
  However, it is incompatible with containers right now, as unprivileged
  containers generally cannot set xattrs on cgroupfs.

- As "invocation_id" key in the kernel keyring. This has the benefit that the
  key cannot be changed by unprivileged service code, and thus is safe to
  access from SUID code (see above). But do note that service code can replace
  the session keyring with a fresh one that lacks the key. However in that case
  the key will not be owned by root, which is easily detectable. The keyring is
  also incompatible with containers right now, as it is not properly namespace
  aware (but this is being worked on), and thus most container managers mask
  the keyring-related system calls.

Ideally we'd only have one way to pass the invocation ID, but the different
ways all have limitations. The invocation ID hookup in journald is currently
only available on the host but not in containers, due to the mentioned
limitations.

How to verify the new invocation ID in the keyring:

 # systemd-run -t /bin/sh
 Running as unit: run-rd917366c04f847b480d486017f7239d6.service
 Press ^] three times within 1s to disconnect TTY.
 # keyctl show
 Session Keyring
  680208392 --alswrv      0     0  keyring: _ses
  250926536 ----s-rv      0     0   \_ user: invocation_id
 # keyctl request user invocation_id
 250926536
 # keyctl read 250926536
 16 bytes of data in key:
 9c96317c ac64495a a42b9cd7 4f3ff96b
 # echo $INVOCATION_ID
 9c96317cac64495aa42b9cd74f3ff96b
 # ^D

This creates a new transient service running a shell. It then verifies the
contents of the keyring, requests the invocation ID key, and reads its payload.
For comparison the invocation ID as passed via the environment variable is also
displayed.

											
										
										
											2016-12-02 15:05:55 +01:00
+								                }
 								        }
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        /* And now, make the keyring owned by the service's user */
 								        if (uid_is_valid(uid) || gid_is_valid(gid))
 								                if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								        /* When requested link the user keyring into the session keyring. */
 								        if (context->keyring_mode == EXEC_KEYRING_SHARED) {
 								                uid_t saved_uid;
 								                gid_t saved_gid;
 								                /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
 								                 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
 								                 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
 								                saved_uid = getuid();
 								                saved_gid = getgid();
 								                if (gid_is_valid(gid) && gid != saved_gid) {
 								                        if (setregid(gid, -1) < 0)
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                                return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								                }
 								                if (uid_is_valid(uid) && uid != saved_uid) {
 								                        if (setreuid(uid, -1) < 0) {
 								                                (void) setregid(saved_gid, -1);
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                                return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								                        }
 								                }
 								                if (keyctl(KEYCTL_LINK,
 								                           KEY_SPEC_USER_KEYRING,
 								                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
 								                        r = -errno;
 								                        (void) setreuid(saved_uid, -1);
 								                        (void) setregid(saved_gid, -1);
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                        return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								                }
 								                if (uid_is_valid(uid) && uid != saved_uid) {
 								                        if (setreuid(saved_uid, -1) < 0) {
 								                                (void) setregid(saved_gid, -1);
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                                return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								                        }
 								                }
 								                if (gid_is_valid(gid) && gid != saved_gid) {
 								                        if (setregid(saved_gid, -1) < 0)
-												execute: rework logging in setup_keyring() to include unit info

Let's use log_unit_error() instead of log_error() everywhere (and
friends).

											
										
										
											2017-09-26 17:42:57 +02:00
+								                                return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								                }
-												Move one space from dbus-execute.c to execute.c

The number of spaces is conserved ;)

											
										
										
											2017-09-16 08:45:02 +02:00
+								        }
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        return 0;
 								}
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
 								        /* Appends each non-negative fd of the two-element 'pair' to 'array', advancing the element count
 								         * '*n' once per fd stored. A NULL 'pair' is a no-op. No bounds check is done here, so the caller
 								         * must have sized 'array' with room for up to two more entries. */
 								        assert(array);
 								        assert(n);
 								        if (!pair)
 								                return;
 								        if (pair[0] >= 0)
 								                array[(*n)++] = pair[0];
 								        if (pair[1] >= 0)
 								                array[(*n)++] = pair[1];
 								}
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								static int close_remaining_fds(
 								                const ExecParameters *params,
 								                ExecRuntime *runtime,
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                DynamicCreds *dcreds,
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								                int user_lookup_fd,
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								                int socket_fd,
 								                int *fds, unsigned n_fds) {
 								        unsigned n_dont_close = 0;
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								        int dont_close[n_fds + 12];
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
 								        assert(params);
 								        if (params->stdin_fd >= 0)
 								                dont_close[n_dont_close++] = params->stdin_fd;
 								        if (params->stdout_fd >= 0)
 								                dont_close[n_dont_close++] = params->stdout_fd;
 								        if (params->stderr_fd >= 0)
 								                dont_close[n_dont_close++] = params->stderr_fd;
 								        if (socket_fd >= 0)
 								                dont_close[n_dont_close++] = socket_fd;
 								        if (n_fds > 0) {
 								                memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
 								                n_dont_close += n_fds;
 								        }
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								        if (runtime)
 								                append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
 								        if (dcreds) {
 								                if (dcreds->user)
 								                        append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
 								                if (dcreds->group)
 								                        append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        }
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								        if (user_lookup_fd >= 0)
 								                dont_close[n_dont_close++] = user_lookup_fd;
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								        return close_all_fds(dont_close, n_dont_close);
 								}
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								static int send_user_lookup(
 								                Unit *unit,
 								                int user_lookup_fd,
 								                uid_t uid,
 								                gid_t gid) {
 								        assert(unit);
 								        /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
 								         * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
 								         * specified.
 								         *
 								         * The message is emitted with one writev() call made of three parts, in this order: the raw uid_t, the
 								         * raw gid_t and the id string of the unit (IOVEC_INIT_STRING() is described as sizing the string with
 								         * strlen(), so presumably the trailing NUL is not sent; confirm in io-util.h).
 								         *
 								         * Returns 0 if nothing had to be sent (user_lookup_fd is negative, or neither uid nor gid is valid) or
 								         * if the write succeeded, and -errno if writev() failed. */
 								        /* No lookup socket was handed to us: nothing to report. */
 								        if (user_lookup_fd < 0)
 								                return 0;
 								        /* Neither a UID nor a GID is known: nothing to report. */
 								        if (!uid_is_valid(uid) && !gid_is_valid(gid))
 								                return 0;
 								        if (writev(user_lookup_fd,
 								               (struct iovec[]) {
-												io-util: add new IOVEC_INIT/IOVEC_MAKE macros

This adds IOVEC_INIT() and IOVEC_MAKE() for initializing iovec structures
from a pointer and a size. On top of these IOVEC_INIT_STRING() and
IOVEC_MAKE_STRING() are added which take a string and automatically
determine the size of the string using strlen().

This patch removes the old IOVEC_SET_STRING() macro, given that
IOVEC_MAKE_STRING() is now useful for similar purposes. Note that the
old IOVEC_SET_STRING() invocations were two characters shorter than the
new ones using IOVEC_MAKE_STRING(), but I think the new syntax is more
readable and more generic as it simply resolves to a C99 literal
structure initialization. Moreover, we can use very similar syntax now
for initializing strings and pointer+size iovec entries. We canalso use
the new macros to initialize function parameters on-the-fly or array
definitions. And given that we shouldn't have so many ways to do the
same stuff, let's just settle on the new macros.

(This also converts some code to use _cleanup_ where dynamically
allocated strings were using IOVEC_SET_STRING() before, to modernize
things a bit)

											
										
										
											2017-09-21 13:52:34 +02:00
+								                           IOVEC_INIT(&uid, sizeof(uid)),
 								                           IOVEC_INIT(&gid, sizeof(gid)),
 								                           IOVEC_INIT_STRING(unit->id) }, 3) < 0)
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e.  all unit types that may invoke processes). if turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks, However need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								                return -errno;
 								        return 0;
 								}
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
 								        int k;
 								        assert(c);
 								        assert(home);
 								        assert(buf);
 								        /* Picks the home directory to use when WorkingDirectory=~ was requested.
 								         *
 								         * Returns 0 and leaves *home alone if a home directory is already known or if the context does not
 								         * ask for one. Returns 1 after storing a directory in *home. Returns the negative error reported by
 								         * get_home_dir() if that lookup fails. When get_home_dir() is used, *home points at the string it
 								         * stored in *buf (presumably allocated, so the caller is expected to free *buf; confirm against
 								         * get_home_dir()). */
 								        if (*home || !c->working_directory_home)
 								                return 0;
 								        if (uid != 0) {
 								                /* Anyone but root: ask get_home_dir() for the home directory. */
 								                k = get_home_dir(buf);
 								                if (k < 0)
 								                        return k;
 								                *home = *buf;
 								        } else {
 								                /* Hardcode /root as home directory for UID 0 */
 								                *home = "/root";
 								        }
 								        return 1;
 								}
-												core: when looking for a UID to use for a dynamic UID start with the current owner of the StateDirectory= and friends

Let's optimize dynamic UID allocation a bit: if a StateDirectory= (or
suchlike) is configured, we start our allocation loop from that UID and
use it if it currently isn't used otherwise. This is beneficial as it
saves us from having to expensively recursively chown() these
directories in the typical case (which StateDirectory= does when it
notices that the owner of the directory doesn't match the UID picked).

With this in place we now have a three-phase logic for allocating a
dynamic UID:

a) first, we try to use the owning UID of StateDirectory=,
   CacheDirectory=, LogDirectory= if that exists and is currently
   otherwise unused.

b) if that didn't work out, we hash the UID from the service name

c) if that didn't yield an unused UID either, randomly pick new ones
   until we find a free one.

											
										
										
											2017-09-28 20:28:09 +02:00
+								static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
 								        _cleanup_strv_free_ char **candidates = NULL;
 								        ExecDirectoryType type;
 								        assert(c);
 								        assert(p);
 								        assert(ret);
 								        assert(c->dynamic_user);
 								        /* Builds the list of "<prefix>/private/<name>" paths whose owning UID is worth trying first when
 								         * allocating a dynamic UID, so that the costly recursive chown() of the special directories can be
 								         * avoided.
 								         *
 								         * Returns 0 and hands the list over via *ret on success. Returns -ENOMEM if a path cannot be
 								         * joined, or the negative error from strv_consume(). On failure *ret is not written. */
 								        for (type = 0; type < _EXEC_DIRECTORY_TYPE_MAX; type++) {
 								                char **name;
 								                /* The configuration directory type is never considered, and neither are types for which
 								                 * no prefix is set. */
 								                if (type == EXEC_DIRECTORY_CONFIGURATION || !p->prefix[type])
 								                        continue;
 								                STRV_FOREACH(name, c->directories[type].paths) {
 								                        char *full;
 								                        int k;
 								                        full = strjoin(p->prefix[type], "/private/", *name);
 								                        if (!full)
 								                                return -ENOMEM;
 								                        /* strv_consume() presumably takes ownership of the string, hence no free here;
 								                         * confirm against strv.h. */
 								                        k = strv_consume(&candidates, full);
 								                        if (k < 0)
 								                                return k;
 								                }
 								        }
 								        /* Pass the list to the caller and clear our pointer so that the cleanup handler leaves it alone. */
 								        *ret = candidates;
 								        candidates = NULL;
 								        return 0;
 								}
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here make sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								static int exec_child(
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                Unit *unit,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here make sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                ExecCommand *command,
 								                const ExecContext *context,
 								                const ExecParameters *params,
 								                ExecRuntime *runtime,
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                DynamicCreds *dcreds,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here make sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                char **argv,
 								                int socket_fd,
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                int named_iofds[3],
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								                int *fds,
 								                unsigned n_storage_fds,
-												core: only apply NonBlocking= to fds passed via socket activation

Make sure to only apply the O_NONBLOCK flag to the fds passed via socket
activation.

Previously the flag was also applied to the fds which came from the fd store
but this was incorrect since services, after being restarted, expect that these
passed fds have their flags unchanged and can be reused as before.

The documentation was a bit unclear about this so clarify it.

											
										
										
											2017-05-12 11:32:53 +02:00
+								                unsigned n_socket_fds,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here make sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                char **files_env,
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								                int user_lookup_fd,
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                int *exit_status) {
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								        _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        _cleanup_free_ gid_t *supplementary_gids = NULL;
 								        const char *username = NULL, *groupname = NULL;
-												core: get the working directory value inside apply_working_directory()

Improve apply_working_directory() and lets get the current working directory
inside of it.

											
										
										
											2016-10-27 09:28:54 +02:00
+								        const char *home = NULL, *shell = NULL;
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								        dev_t journal_stream_dev = 0;
 								        ino_t journal_stream_ino = 0;
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
 								                needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
 								                needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
 								                needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SELINUX
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								        bool use_selinux = false;
-												core: define variables only when they are required

Follow-up for 7f18ef0a555a3c3cef08e0965dc453fe5954b5a7.

											
										
										
											2017-08-02 07:38:08 +02:00
+								#endif
-												build-sys: s/HAVE_SMACK/ENABLE_SMACK/

Same justification as for HAVE_UTMP.

											
										
										
											2017-10-03 12:22:40 +02:00
+								#if ENABLE_SMACK
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								        bool use_smack = false;
-												core: define variables only when they are required

Follow-up for 7f18ef0a555a3c3cef08e0965dc453fe5954b5a7.

											
										
										
											2017-08-02 07:38:08 +02:00
+								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_APPARMOR
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								        bool use_apparmor = false;
-												core: define variables only when they are required

Follow-up for 7f18ef0a555a3c3cef08e0965dc453fe5954b5a7.

											
										
										
											2017-08-02 07:38:08 +02:00
+								#endif
-												treewide: introduce UID_INVALID (and friends) as macro for (uid_t) -1

											
										
										
											2014-11-28 20:51:01 +01:00
+								        uid_t uid = UID_INVALID;
 								        gid_t gid = GID_INVALID;
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								        int i, r, ngids = 0;
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								        unsigned n_fds;
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        ExecDirectoryType dt;
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        int secure_bits;
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        assert(unit);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								        assert(command);
 								        assert(context);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        assert(params);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here make sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        assert(exit_status);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
 								        rename_process_from_path(command->path);
 								        /* We reset exactly these signals, since they are the
 								         * only ones we set to SIG_IGN in the main daemon. All
 								         * others we leave untouched because we set them to
 								         * SIG_DFL or a valid handler initially, both of which
 								         * will be demoted to SIG_DFL. */
-												tree-wide: whenever we fork off a foreign child process reset signal mask/handlers

Also, when the child is potentially long-running make sure to set a
death signal.

Also, ignore the result of the reset operations explicitly by casting
them to (void).

											
										
										
											2015-05-31 23:55:55 +02:00
+								        (void) default_signals(SIGNALS_CRASH_HANDLER,
 								                               SIGNALS_IGNORE, -1);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
 								        if (context->ignore_sigpipe)
-												tree-wide: whenever we fork off a foreign child process reset signal mask/handlers

Also, when the child is potentially long-running make sure to set a
death signal.

Also, ignore the result of the reset operations explicitly by casting
them to (void).

											
										
										
											2015-05-31 23:55:55 +02:00
+								                (void) ignore_signals(SIGPIPE, -1);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        r = reset_signal_mask();
 								        if (r < 0) {
 								                *exit_status = EXIT_SIGNAL_MASK;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (params->idle_pipe)
 								                do_idle_pipe_dance(params->idle_pipe);
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												execute: make use of the new logging mode in execute.c

											
										
										
											2017-09-26 17:45:32 +02:00
+								        /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
 								         * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
 								         * any fds open we don't really want open during the transition. In order to make logging work, we switch the
 								         * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        log_forget_fds();
-												execute: make use of the new logging mode in execute.c

											
										
										
											2017-09-26 17:45:32 +02:00
+								        log_set_open_when_needed(true);
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
-												execute: let's close glibc syslog channels too

Just in case something opened them, let's make sure glibc invalidates
them too.

Thankfully so far no library opened log channels behind our back, at
least as far as I know, hence this is actually a NOP, but let's better
be safe than sorry.

											
										
										
											2017-09-26 17:52:25 +02:00
+								        /* In case anything used libc syslog(), close this here, too */
 								        closelog();
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								        n_fds = n_storage_fds + n_socket_fds;
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_FDS;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								        }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (!context->same_pgrp)
 								                if (setsid() < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_SETSID;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                }
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end up resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        exec_context_tty_reset(context, params);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core: confirm_spawn: always accept units with same_pgrp set for now

For some reason, units remaining in the same process group as PID 1
(same_pgrp=true) fail to acquire the console even if it's not taken by anyone.

So always accept for units with same_pgrp set for now.

											
										
										
											2016-11-14 17:37:40 +01:00
+								        if (unit_shall_confirm_spawn(unit)) {
-												core: allow to redirect confirmation messages to a different console

It's rather hard to parse the confirmation messages (enabled with
systemd.confirm_spawn=true) amongst the status messages and the kernel
ones (if enabled).

This patch gives the possibility to the user to redirect the confirmation
message to a different virtual console, either by giving its name or its path,
so those messages are separated from the other ones and easier to read.

											
										
										
											2016-11-02 10:38:22 +01:00
+								                const char *vc = params->confirm_spawn;
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                _cleanup_free_ char *cmdline = NULL;
 								                cmdline = exec_command_line(argv);
 								                if (!cmdline) {
-												execute: improve and augment execution log messages

Let's generate friendly messages for more cases, and make slight
adjustments to the existing messages.

											
										
										
											2017-09-15 16:42:09 +02:00
+								                        *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_oom();
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command failed
 - don't execute and pretend that the command succeeded
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core: add 'i' in confirm spawn to give a short summary of the unit to spawn

											
										
										
											2016-11-12 14:55:12 +01:00
+								                r = ask_for_confirmation(vc, unit, cmdline);
-												core: rework ask_for_confirmation()

Now the responses are handled by ask_for_confirmation() as well as the report of
any errors occurring during the process of retrieving the confirmation response.

One benefit of this is that there's no need to open/close the console one more
time when reporting error/status messages.

The caller now just needs to care about the return values whose meanings are:

 - don't execute and pretend that the command succeeded
 - don't execute and pretend that the command succeeed
 - positive answer, execute the command

Also some slight code reorganization and introduce write_confirm_error() and
write_confirm_error_fd(). write_confirm_message becomes unneeded.

											
										
										
											2016-11-02 13:51:02 +01:00
+								                if (r != CONFIRM_EXECUTE) {
 								                        if (r == CONFIRM_PRETEND_SUCCESS) {
 								                                *exit_status = EXIT_SUCCESS;
 								                                return 0;
 								                        }
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_CONFIRM;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        log_unit_error(unit, "Execution cancelled by the user");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                        return -ECANCELED;
 								                }
 								        }
-												execute: improve exec_spawn() logging

											
										
										
											2010-04-10 17:46:01 +02:00
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								        if (context->dynamic_user && dcreds) {
-												core: when looking for a UID to use for a dynamic UID start with the current owner of the StateDirectory= and friends

Let's optimize dynamic UID allocation a bit: if a StateDirectory= (or
suchlike) is configured, we start our allocation loop from that UID and
use it if it currently isn't used otherwise. This is beneficial as it
saves us from having to expensively recursively chown() these
directories in the typical case (which StateDirectory= does when it
notices that the owner of the directory doesn't match the UID picked).

With this in place we now have a three-phase logic for allocating a
dynamic UID:

a) first, we try to use the owning UID of StateDirectory=,
   CacheDirectory=, LogDirectory= if that exists and is currently
   otherwise unused.

b) if that didn't work out, we hash the UID from the service name

c) if that didn't yield an unused UID either, randomly pick new ones
   until we find a free one.

											
										
										
											2017-09-28 20:28:09 +02:00
+								                _cleanup_strv_free_ char **suggested_paths = NULL;
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
-												nss: add new "nss-systemd" NSS module for mapping dynamic users

With this NSS module all dynamic service users will be resolvable via NSS like
any real user.

											
										
										
											2016-07-14 19:19:49 +02:00
+								                /* Make sure we bypass our own NSS module for any NSS checks */
 								                if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
 								                        *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
-												nss: add new "nss-systemd" NSS module for mapping dynamic users

With this NSS module all dynamic service users will be resolvable via NSS like
any real user.

											
										
										
											2016-07-14 19:19:49 +02:00
+								                }
-												core: when looking for a UID to use for a dynamic UID start with the current owner of the StateDirectory= and friends

Let's optimize dynamic UID allocation a bit: if a StateDirectory= (or
suchlike) is configured, we start our allocation loop from that UID and
use it if it currently isn't used otherwise. This is beneficial as it
saves us from having to expensively recursively chown() these
directories in the typical case (which StateDirectory= does when it
notices that the owner of the directory doesn't match the UID picked).

With this in place we now have a three-phase logic for allocating a
dynamic UID:

a) first, we try to use the owning UID of StateDirectory=,
   CacheDirectory=, LogDirectory= if that exists and is currently
   otherwise unused.

b) if that didn't work out, we hash the UID from the service name

c) if that didn't yield an unused UID either, randomly pick new ones
   until we find a free one.

											
										
										
											2017-09-28 20:28:09 +02:00
+								                r = compile_suggested_paths(context, params, &suggested_paths);
 								                if (r < 0) {
 								                        *exit_status = EXIT_MEMORY;
 								                        return log_oom();
 								                }
 								                r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                if (r < 0) {
 								                        *exit_status = EXIT_USER;
-												core: fix invalid error message

The error message corresponding to EILSEQ is "Invalid or incomplete
multibyte or wide character", and is not suitable in this case.
So, let's show a custom error message when the function
dynamic_creds_realize() returns -EILSEQ.

											
										
										
											2017-10-18 01:57:54 +02:00
+								                        if (r == -EILSEQ) {
 								                                log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
 								                                return -EOPNOTSUPP;
 								                        }
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
-												journal: call connect() with dropped privileges

When systemd starts a service, it first opened /run/systemd/journal/stdout
socket, and only later switched to the right user.group (if they are
specified). Later on, journald looked at the credentials, and saw
root.root, because credentials are stored at the time the socket is
opened. As a result, all messages passed over _TRANSPORT=stdout were
logged with _UID=0, _GID=0.

Drop real uid and gid temporarily to fix the issue.

											
										
										
											2015-01-01 04:40:41 +01:00
+								                }
-												pid1: provide a more detailed error message when execution fails (#5074)

Fixes #5000.
											
										
										
											2017-01-18 04:38:55 +01:00
+								                if (!uid_is_valid(uid)) {
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                        *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
-												pid1: provide a more detailed error message when execution fails (#5074)

Fixes #5000.
											
										
										
											2017-01-18 04:38:55 +01:00
+								                        return -ESRCH;
 								                }
 								                if (!gid_is_valid(gid)) {
 								                        *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                        return -ESRCH;
 								                }
-												core: fix group ownership when Group is set

When Group is set in the unit, the runtime directories are owned by
this group and not the default group of the user (same for cgroup paths
and standard outputs)

Fix #1231

											
										
										
											2015-09-21 15:45:51 +02:00
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                if (dcreds->user)
 								                        username = dcreds->user->name;
 								        } else {
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
 								                if (r < 0) {
 								                        *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
-												core: fix group ownership when Group is set

When Group is set in the unit, the runtime directories are owned by
this group and not the default group of the user (same for cgroup paths
and standard outputs)

Fix #1231

											
										
										
											2015-09-21 15:45:51 +02:00
+								                }
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                r = get_fixed_group(context, &groupname, &gid);
 								                if (r < 0) {
 								                        *exit_status = EXIT_GROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
-												core: first lookup and cache creds then apply them after namespace setup

This fixes: https://github.com/systemd/systemd/issues/4357

Let's lookup and cache creds then apply them. We also switch from
getgroups() to getgrouplist().

											
										
										
											2016-10-23 23:24:14 +02:00
+								                }
-												core: intialize user aux groups and SupplementaryGroups= when DynamicUser= is set

Make sure that when DynamicUser= is set that we initialize the user
supplementary groups and that we also support SupplementaryGroups=

Fixes: https://github.com/systemd/systemd/issues/4539

Thanks Evgeny Vereshchagin (@evverx)

											
										
										
											2016-11-02 22:42:40 +01:00
+								        }
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
-												core: intialize user aux groups and SupplementaryGroups= when DynamicUser= is set

Make sure that when DynamicUser= is set that we initialize the user
supplementary groups and that we also support SupplementaryGroups=

Fixes: https://github.com/systemd/systemd/issues/4539

Thanks Evgeny Vereshchagin (@evverx)

											
										
										
											2016-11-02 22:42:40 +01:00
+								        /* Initialize user supplementary groups and get SupplementaryGroups= ones */
 								        r = get_supplementary_groups(context, username, groupname, gid,
 								                                     &supplementary_gids, &ngids);
 								        if (r < 0) {
 								                *exit_status = EXIT_GROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								        }
-												core: fix group ownership when Group is set

When Group is set in the unit, the runtime directories are owned by
this group and not the default group of the user (same for cgroup paths
and standard outputs)

Fix #1231

											
										
										
											2015-09-21 15:45:51 +02:00
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								        r = send_user_lookup(unit, user_lookup_fd, uid, gid);
 								        if (r < 0) {
 								                *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								        }
 								        user_lookup_fd = safe_close(user_lookup_fd);
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								        r = acquire_home(context, uid, &home, &home_buffer);
 								        if (r < 0) {
 								                *exit_status = EXIT_CHDIR;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
-												execute: set working directory to /root if User= is not set, but WorkingDirectory=~ is

Or actually, try to do the right thing depending on what is
available:

- If we know $HOME from User=, then use that.
- If the UID for the service is 0, hardcode that WorkingDirectory=~ means WorkingDirectory=/root
- In any other case (which will be the unprivileged --user case), use
  get_home_dir() to find the $HOME of the user we are running as.
- Otherwise fail.

Fixes: #5246 #5124

											
										
										
											2017-02-09 11:58:39 +01:00
+								        }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        /* If a socket is connected to STDIN/STDOUT/STDERR, we
 								         * must sure to drop O_NONBLOCK */
 								        if (socket_fd >= 0)
-												core: add support for setting stdin/stdout/stderr for transient services

When starting a transient service, allow setting stdin/stdout/stderr fds
for it, by passing them in via the bus.

This also simplifies some of the serialization code for units.

											
										
										
											2015-10-07 23:07:39 +02:00
+								                (void) fd_nonblock(socket_fd, false);
-												yay, we can start socket units

											
										
										
											2010-01-27 04:31:52 +01:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        r = setup_input(context, params, socket_fd, named_iofds);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_STDIN;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_STDOUT;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_STDERR;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
 								        if (params->cgroup_path) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
 								                if (r < 0) {
 								                        *exit_status = EXIT_CGROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
-												reset signal mask when forking

											
										
										
											2010-01-27 06:17:51 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												reset signal mask when forking

											
										
										
											2010-01-27 06:17:51 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->oom_score_adjust_set) {
-												core: make EPERM errors when applying OOM adjustment for forked processes non-fatal

This should be useful for user namespaces.

											
										
										
											2015-01-08 23:12:16 +01:00
+								                char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
-												service: introduce Type=idle and use it for gettys

Type=idle is much like Type=simple, however between the fork() and the
exec() in the child we wait until PID 1 informs us that no jobs are
left.

This is mostly a cosmetic fix to make gettys appear only after all boot
output is finished and complete.

Note that this does not impact the normal job logic as we do not delay
the completion of any jobs. We just delay the invocation of the actual
binary, and only for services that otherwise would be of Type=simple.

											
										
										
											2012-04-24 14:28:00 +02:00
-												core: make EPERM errors when applying OOM adjustment for forked processes non-fatal

This should be useful for user namespaces.

											
										
										
											2015-01-08 23:12:16 +01:00
+								                /* When we can't make this change due to EPERM, then
 								                 * let's silently skip over it. User namespaces
 								                 * prohibit write access to this file, and we
 								                 * shouldn't trip up over that. */
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
-												core: make EPERM errors when applying OOM adjustment for forked processes non-fatal

This should be useful for user namespaces.

											
										
										
											2015-01-08 23:12:16 +01:00
+								                sprintf(t, "%i", context->oom_score_adjust);
-												tree-wide: fix write_string_file() user that should not create files

The latest consolidation cleanup of write_string_file() revealed some users
of that helper which should have used write_string_file_no_create() in the
past but didn't. Basically, all existing users that write to files in /sys
and /proc should not expect to write to a file which is not yet existent.

											
										
										
											2015-07-07 01:27:20 +02:00
+								                r = write_string_file("/proc/self/oom_score_adj", t, 0);
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                if (IN_SET(r, -EPERM, -EACCES))
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                else if (r < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_OOM_ADJUST;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
 								        if (context->nice_set)
 								                if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_NICE;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->cpu_sched_set) {
 								                struct sched_param param = {
 								                        .sched_priority = context->cpu_sched_priority,
 								                };
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                r = sched_setscheduler(0,
 								                                       context->cpu_sched_policy |
 								                                       (context->cpu_sched_reset_on_fork ?
 								                                        SCHED_RESET_ON_FORK : 0),
 								                                       &param);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SETSCHEDULER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
-												execute: close inherited fds earlier

											
										
										
											2010-07-12 20:34:53 +02:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												execute: close inherited fds earlier

											
										
										
											2010-07-12 20:34:53 +02:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->cpuset)
 								                if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_CPUAFFINITY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->ioprio_set)
 								                if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_IOPRIO;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                }
-												exec: make sure O_NONBLOCK is off for all sockets passed as STDIN/STDOUT/STDERR

											
										
										
											2010-08-30 23:31:27 +02:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->timer_slack_nsec != NSEC_INFINITY)
 								                if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_TIMERSLACK;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
-												execute: log errors from "sd(EXEC)"

To give the administrator more hints about failures occurring in spawning
of commands than just the exit code, log the strerror.
All fds are closed, so reopen the log.

Related-to: https://bugzilla.redhat.com/show_bug.cgi?id=752901

											
										
										
											2011-11-17 00:21:16 +01:00
+								                }
-												support chrooting/setting of ioprio when spawning

											
										
										
											2010-01-29 20:46:22 +01:00
-												util-lib: wrap personality() to fix up broken glibc error handling (#6766)

glibc appears to propagate different errors in different ways, let's fix
this up, so that our own code doesn't get confused by this.

See #6752 + #6737 for details.

Fixes: #6755
											
										
										
											2017-09-08 16:16:29 +02:00
+								        if (context->personality != PERSONALITY_INVALID) {
 								                r = safe_personality(context->personality);
 								                if (r < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        *exit_status = EXIT_PERSONALITY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
-												execute: log errors from "sd(EXEC)"

To give the administrator more hints about failures occurring in spawning
of commands than just the exit code, log the strerror.
All fds are closed, so reopen the log.

Related-to: https://bugzilla.redhat.com/show_bug.cgi?id=752901

											
										
										
											2011-11-17 00:21:16 +01:00
+								                }
-												util-lib: wrap personality() to fix up broken glibc error handling (#6766)

glibc appears to propagate different errors in different ways, let's fix
this up, so that our own code doesn't get confused by this.

See #6752 + #6737 for details.

Fixes: #6755
											
										
										
											2017-09-08 16:16:29 +02:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->utmp_id)
-												tree-wide: make use of getpid_cached() wherever we can

This moves pretty much all uses of getpid() over to getpid_cached(). I
didn't specifically check whether the optimization is worth it for each
replacement, but in order to keep things simple and systematic I
switched over everything at once.

											
										
										
											2017-07-20 16:19:18 +02:00
+								                utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
-												core/execute: pass the username to utmp/wtmp database

Before previous commit, username would be NULL for root, and set only
for other users. So the argument passed to utmp_put_init_process()
would be "root" for other users and NULL for root. Seems strange.
Instead, always pass the username if available.

											
										
										
											2017-02-03 17:32:42 +01:00
+								                                      context->tty_path,
-												core: optionally create LOGIN_PROCESS or USER_PROCESS utmp entries

When generating utmp/wtmp entries, optionally add both LOGIN_PROCESS and
INIT_PROCESS entries or even all three of LOGIN_PROCESS, INIT_PROCESS
and USER_PROCESS entries, instead of just a single INIT_PROCESS entry.

With this change systemd may be used to not only invoke a getty directly
in a SysV-compliant way but alternatively also a login(1) implementation
or even forego getty and login entirely, and invoke arbitrary shells in
a way that they appear in who(1) or w(1).

This is preparation for a later commit that adds a "machinectl shell"
operation to invoke a shell in a container, in a way that is compatible
with who(1) and w(1).

											
										
										
											2015-08-23 13:14:04 +02:00
+								                                      context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
 								                                      context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
 								                                      USER_PROCESS,
-												core/execute: pass the username to utmp/wtmp database

Before previous commit, username would be NULL for root, and set only
for other users. So the argument passed to utmp_put_init_process()
would be "root" for other users and NULL for root. Seems strange.
Instead, always pass the username if available.

											
										
										
											2017-02-03 17:32:42 +01:00
+								                                      username);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core: chown() any TTY used for stdin, not just when StandardInput=tty is used (#4347)

If stdin is supplied as an fd for transient units (using the
StandardInputFileDescriptor pseudo-property for transient units), then we
should also fix up the TTY ownership, not just when we opened the TTY
ourselves.

This simply drops the explicit is_terminal_input()-based check. Note that
chown_terminal() internally does a much more appropriate isatty()-based check
anyway, hence we can drop this without replacement.

Fixes: #4260
											
										
										
											2016-10-11 20:07:22 +02:00
+								        if (context->user) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                r = chown_terminal(STDIN_FILENO, uid);
 								                if (r < 0) {
 								                        *exit_status = EXIT_STDIN;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												cgroup: add cgroupsification

											
										
										
											2010-03-31 16:29:55 +02:00
-												core: introduce new Delegate=yes/no property controlling creation of cgroup subhierarchies

For privileged units this resource control property ensures that the
processes have all controllers systemd manages enabled.

For unprivileged services (those with User= set) this ensures that
access rights to the service cgroup are granted to the user in question,
to create further subgroups. Note that this only applies to the
name=systemd hierarchy though, as access to other controllers is not
safe for unprivileged processes.

Delegate=yes should be set for container scopes where a systemd instance
inside the container shall manage the hierarchies below its own cgroup
and have access to all controllers.

Delegate=yes should also be set for user@.service, so that systemd
--user can run, controlling its own cgroup tree.

This commit changes machined, systemd-nspawn@.service and user@.service
to set this boolean, in order to ensure that container management will
just work, and the user systemd instance can run fine.

											
										
										
											2014-11-05 17:57:23 +01:00
+								        /* If delegation is enabled we'll pass ownership of the cgroup
 								         * (but only in systemd's own controller hierarchy!) to the
 								         * user of the new process. */
-												execute: also fold the cgroup delegate bit into ExecFlags

											
										
										
											2017-08-01 10:51:18 +02:00
+								        if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
 								                if (r < 0) {
 								                        *exit_status = EXIT_CGROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
 								                if (r < 0) {
 								                        *exit_status = EXIT_CGROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
-												execute: add one more ExecFlags flag, for controlling unconditional directory chowning

Let's decouple the Manager object from the execution logic a bit more
here too, and simply pass along the fact whether we should
unconditionally chown the runtime/... directories via the ExecFlags
field too.

											
										
										
											2017-08-01 10:35:10 +02:00
+								                r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                if (r < 0)
 								                        return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								        r = build_environment(
-												core: bypass dynamic user lookups from dbus-daemon

dbus-daemon does NSS name look-ups in order to enforce its bus policy. This
might dead-lock if an NSS module wants to use D-Bus for the look-up itself,
like our nss-systemd does. Let's work around this by bypassing bus
communication in the NSS module if we run inside of dbus-daemon. To make this
work we keep a bit of extra state in /run/systemd/dynamic-uid/ so that we don't
have to consult the bus, but can still resolve the names.

Note that the normal codepath continues to be via the bus, so that resolving
works from all mount namespaces and is subject to authentication, as before.

This is a bit dirty, but not too dirty, as dbus daemon is kinda special anyway
for PID 1.

											
										
										
											2016-08-02 12:28:51 +02:00
+								                        unit,
-												core: set $JOURNAL_STREAM to the dev_t/ino_t of the journal stream of executed services

This permits services to detect whether their stdout/stderr is connected to the
journal, and if so talk to the journal directly, thus permitting carrying of
metadata.

As requested by the gtk folks: #2473

											
										
										
											2016-06-14 16:50:45 +02:00
+								                        context,
 								                        params,
 								                        n_fds,
 								                        home,
 								                        username,
 								                        shell,
 								                        journal_stream_dev,
 								                        journal_stream_ino,
 								                        &our_env);
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        if (r < 0) {
 								                *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_oom();
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        }
 								        r = build_pass_environment(context, &pass_env);
 								        if (r < 0) {
 								                *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_oom();
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        }
 								        accum_env = strv_env_merge(5,
 								                                   params->environment,
 								                                   our_env,
 								                                   pass_env,
 								                                   context->environment,
 								                                   files_env,
 								                                   NULL);
 								        if (!accum_env) {
 								                *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_oom();
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        }
-												execute: Cleanup the environment early

By cleaning up before setting up PAM we maintain control of overriding
behavior in setting variables. Otherwise, pam_putenv is in control.
This also makes sure we use a cleaned up environment in replacing
variables in argv.

											
										
										
											2016-07-07 12:36:33 +02:00
+								        accum_env = strv_env_clean(accum_env);
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
-												execute: drop group privileges only after setting up namespace

If PrivateDevices=yes is set, the namespace code creates device nodes in /dev
that should be owned by the host's root, hence let's make sure we set up the
namespace before dropping group privileges.

											
										
										
											2016-08-25 17:29:12 +02:00
+								        (void) umask(context->umask);
-												exec: move mac_smack_apply_pid() and setup_pam() to same condition block

This cleans up exec_child() function by moving mac_smack_apply_pid()
and setup_pam() to the same condition block, since both of them have
the same condition (i.e params->apply_permissions). It improves
readability without changing its operation.

											
										
										
											2015-09-23 13:53:09 +02:00
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								        r = setup_keyring(unit, context, params, uid, gid);
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_KEYRING;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
-												core: run each system service with a fresh session keyring

This patch ensures that each system service gets its own session kernel keyring
automatically, and implicitly. Without this a keyring is allocated for it
on-demand, but is then linked with the user's kernel keyring, which is OK
behaviour for logged in users, but not so much for system services.

With this change each service gets a session keyring that is specific to the
service and ceases to exist when the service is shut down. The session keyring
is not linked up with the user keyring and keys hence only search within the
session boundaries by default.

(This is useful in a later commit to store per-service material in the keyring,
for example the invocation ID)

(With input from David Howells)

											
										
										
											2016-12-02 01:54:41 +01:00
+								        }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is then left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
-												core: rename EXEC_APPLY_PERMISSIONS → EXEC_APPLY_SANDBOXING

"Permissions" was a bit of a misnomer, as it suggests that UNIX file
permission bits are adjusted, which aren't really changed here. Instead,
this is about UNIX credentials such as users or groups, as well as
namespacing, hence let's use a more generic term here, without any
misleading reference to UNIX file permissions: "sandboxing", which shall
refer to all kinds of sandboxing technologies, including UID/GID
dropping, selinux relabelling, namespacing, seccomp, and so on.

											
										
										
											2017-08-01 11:30:44 +02:00
+								        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is then left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
 								        needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is then left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
 								        if (needs_ambient_hack)
 								                needs_setuid = false;
 								        else
 								                needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
 								        if (needs_sandboxing) {
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								                /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
 								                 * present. The actual MAC context application will happen later, as late as possible, to avoid
 								                 * impacting our own code paths. */
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SELINUX
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                use_selinux = mac_selinux_use();
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								#endif
-												build-sys: s/HAVE_SMACK/ENABLE_SMACK/

Same justification as for HAVE_UTMP.

											
										
										
											2017-10-03 12:22:40 +02:00
+								#if ENABLE_SMACK
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                use_smack = mac_smack_use();
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								#endif
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_APPARMOR
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                use_apparmor = mac_apparmor_use();
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								#endif
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is then left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        }
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is then left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_setuid) {
 								                if (context->pam_name && username) {
 								                        r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_PAM;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is then left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                        }
 								                }
-												exec: move mac_smack_apply_pid() and setup_pam() to same condition block

This cleans up exec_child() function by moving mac_smack_apply_pid()
and setup_pam() to the same condition block, since both of them have
the same condition (i.e. params->apply_permissions). It improves
readability without changing its operation.

											
										
										
											2015-09-23 13:53:09 +02:00
+								        }
-												core: add Personality= option for units to set the personality for spawned processes

											
										
										
											2014-02-19 02:15:24 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
-												namespace: fall back gracefully when kernel doesn't support network namespaces (#7024)


											
										
										
											2017-10-10 09:46:13 +02:00
+								                if (ns_type_supported(NAMESPACE_NET)) {
 								                        r = setup_netns(runtime->netns_storage_socket);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_NETWORK;
 								                                return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
 								                        }
 								                } else
 								                        log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												service: optionally, create INIT_PROCESS/DEAD_PROCESS entries for a service

This should fix accounting for pam_limits and suchlike.

https://bugzilla.redhat.com/show_bug.cgi?id=636036

											
										
										
											2010-10-08 16:06:23 +02:00
-												core: Private*/Protect* options with RootDirectory

When a service is chrooted with the option RootDirectory=/opt/..., then
the options PrivateDevices, PrivateTmp, ProtectHome, ProtectSystem must
mount the directories under $RootDirectory/{dev,tmp,home,usr,boot}.

The test-ns tool can test setup_namespace() with and without chroot:
 $ sudo TEST_NS_PROJECTS=/home/lennart/projects ./test-ns
 $ sudo TEST_NS_CHROOT=/home/alban/debian-tree TEST_NS_PROJECTS=/home/alban/debian-tree/home/alban/Documents ./test-ns

											
										
										
											2015-05-18 12:20:28 +02:00
+								        needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
 								        if (needs_mount_namespace) {
-												core: skip ReadOnlyPaths= and other permission-related mounts on PermissionsStartOnly= (#5309)

ReadOnlyPaths=, ProtectHome=, InaccessiblePaths= and ProtectSystem= are
about restricting access and little more, hence they should be disabled
if PermissionsStartOnly= is used or ExecStart= lines are prefixed with a
"+". Do that.

(Note that we will still create namespaces and stuff, since that's about
a lot more than just permissions. We'll simply disable the effect of
the four options mentioned above, but nothing else mount related.)

This also adds a test for this, to ensure this works as intended.

No documentation updates, as the documentation is already vague enough
to support the new behaviour ("If true, the permission-related execution
options…"). We could clarify this further, but I think we might want to
extend the switches' behaviour a bit more in future, hence leave it at
this for now.

Fixes: #5308
											
										
										
											2017-02-12 06:44:46 +01:00
+								                r = apply_mount_namespace(unit, command, context, params, runtime);
-												execute: if RuntimeDirectory= is set, it should be writable

Implicitly make all dirs set with RuntimeDirectory= writable, as the concept
otherwise makes no sense.

											
										
										
											2016-08-25 10:42:38 +02:00
+								                if (r < 0) {
 								                        *exit_status = EXIT_NAMESPACE;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
-												execute: if RuntimeDirectory= is set, it should be writable

Implicitly make all dirs set with RuntimeDirectory= writable, as the concept
otherwise makes no sense.

											
										
										
											2016-08-25 10:42:38 +02:00
+								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core: lets apply working directory just after mount namespaces

This applies the groups after applying the working directory. This
may allow some flexibility, but at the same time it is not a big deal since we
don't execute or do anything between applying the working directory and
dropping groups.

											
										
										
											2016-10-25 16:24:35 +02:00
+								        /* Apply just after mount namespace setup */
-												execute: set the right exit status for CHDIR vs. CHROOT

Fixes: #5125

											
										
										
											2017-02-09 13:17:00 +01:00
+								        r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								        if (r < 0)
 								                return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
-												core: lets apply working directory just after mount namespaces

This applies the groups after applying the working directory. This
may allow some flexibility, but at the same time it is not a big deal since we
don't execute or do anything between applying the working directory and
dropping groups.

											
										
										
											2016-10-25 16:24:35 +02:00
-												core: initialize groups list before checking SupplementaryGroups= of a unit (#4533)

Always initialize the supplementary groups of caller before checking the
unit SupplementaryGroups= option.

Fixes https://github.com/systemd/systemd/issues/4531
											
										
										
											2016-11-02 17:51:35 +01:00
+								        /* Drop groups as early as possible */
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is then left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_setuid) {
-												core: cleanup for enforce_groups() (#7064)

SupplementaryGroups= is preprocessed in get_supplementary_groups().
So, it is not necessary to input ExecContext to enforce_groups().
											
										
										
											2017-10-12 08:10:25 +02:00
+								                r = enforce_groups(gid, supplementary_gids, ngids);
-												execute: drop group priviliges only after setting up namespace

If PrivateDevices=yes is set, the namespace code creates device nodes in /dev
that should be owned by the host's root, hence let's make sure we set up the
namespace before dropping group privileges.

											
										
										
											2016-08-25 17:29:12 +02:00
+								                if (r < 0) {
 								                        *exit_status = EXIT_GROUP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
-												execute: drop group priviliges only after setting up namespace

If PrivateDevices=yes is set, the namespace code creates device nodes in /dev
that should be owned by the host's root, hence let's make sure we set up the
namespace before dropping group privileges.

											
										
										
											2016-08-25 17:29:12 +02:00
+								                }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is then left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        }
-												execute: drop group priviliges only after setting up namespace

If PrivateDevices=yes is set, the namespace code creates device nodes in /dev
that should be owned by the host's root, hence let's make sure we set up the
namespace before dropping group privileges.

											
										
										
											2016-08-25 17:29:12 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is then left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_sandboxing) {
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SELINUX
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
-												execute: simplify needs_sandboxing checking

Let's merge three if blocks that shall only run when sandboxing is applied
into one.

Note that this changes behaviour in one corner case: PrivateUsers=1
now honours both PermissionsStartOnly= and the "+" modifier in
ExecStart=, and not just the former, as before. This was an oversight,
so let's fix this now, at a point in time the option isn't used much
yet.

											
										
										
											2017-08-01 11:44:37 +02:00
+								                        r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_SELINUX_CONTEXT;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
-												execute: simplify needs_sandboxing checking

Let's merge three if blocks that shall only run when sandboxing is applied
into one.

Note that this changes behaviour in one corner case: PrivateUsers=1
now honours both PermissionsStartOnly= and the "+" modifier in
ExecStart=, and not just the former, as before. This was an oversight,
so let's fix this now, at a point in time the option isn't used much
yet.

											
										
										
											2017-08-01 11:44:37 +02:00
+								                        }
-												selinux: figure out selinux context applied on exec() before closing all fds

We need original socket_fd around otherwise mac_selinux_get_child_mls_label
fails with -EINVAL return code. Also don't call setexeccon twice but rather pass
context value of SELinuxContext option as an extra argument.

											
										
										
											2014-11-12 13:53:27 +01:00
+								                }
 								#endif
-												execute: simplify needs_sandboxing checking

Let's merge three if blocks that shall only run when sandboxing is applied
into one.

Note that this changes behaviour in one corner case: PrivateUsers=1
now honours both PermissionsStartOnly= and the "+" modifier in
ExecStart=, and not just the former, as before. This was an oversight,
so let's fix this now, at a point in time the option isn't used much
yet.

											
										
										
											2017-08-01 11:44:37 +02:00
+								                if (context->private_users) {
 								                        r = setup_private_users(uid, gid);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
-												execute: simplify needs_sandboxing checking

Let's merge three if blocks that shall only run when sandboxing is applied
into one.

Note that this changes behaviour in one corner case: PrivateUsers=1
now honours both PermissionsStartOnly= and the "+" modifier in
ExecStart=, and not just the former, as before. This was an oversight,
so let's fix this now, at a point in time the option isn't used much
yet.

											
										
										
											2017-08-01 11:44:37 +02:00
+								                        }
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								                }
 								        }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
 								         * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
 								         * was needed to upload the policy and can now be closed as well. */
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        r = close_all_fds(fds, n_fds);
 								        if (r >= 0)
 								                r = shift_fds(fds, n_fds);
 								        if (r >= 0)
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the fd store
mechanism. That way each fd is counted in one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								                r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0) {
 								                *exit_status = EXIT_FDS;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        secure_bits = context->secure_bits;
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_sandboxing) {
 								                uint64_t bset;
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                for (i = 0; i < _RLIMIT_MAX; i++) {
-												execute: use the return value of setrlimit_closest() properly

It's a function defined by us, hence we should look for the error in its return
value, not in "errno".

											
										
										
											2016-06-23 01:31:24 +02:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                        if (!context->rlimit[i])
 								                                continue;
-												execute: use the return value of setrlimit_closest() properly

It's a function defined by us, hence we should look for the error in its return
value, not in "errno".

											
										
										
											2016-06-23 01:31:24 +02:00
+								                        r = setrlimit_closest(i, context->rlimit[i]);
 								                        if (r < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                                *exit_status = EXIT_LIMITS;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								                        }
 								                }
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE (which may be used to lock up the system) is blocked.

											
										
										
											2016-06-23 01:45:45 +02:00
+								                /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
 								                if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
 								                        if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
 								                                *exit_status = EXIT_LIMITS;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE (which may be used to lock up the system) is blocked.

											
										
										
											2016-06-23 01:45:45 +02:00
+								                        }
 								                }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                bset = context->capability_bounding_set;
 								                /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
 								                 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
 								                 * instead of us doing that */
 								                if (needs_ambient_hack)
 								                        bset |= (UINT64_C(1) << CAP_SETPCAP) |
 								                                (UINT64_C(1) << CAP_SETUID) |
 								                                (UINT64_C(1) << CAP_SETGID);
 								                if (!cap_test_all(bset)) {
 								                        r = capability_bounding_set_drop(bset, false);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        if (r < 0) {
 								                                *exit_status = EXIT_CAPABILITIES;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
-												execute: do initgroups() first, pam initialization second so that it can still modify the groups list

											
										
										
											2011-06-30 02:15:01 +02:00
+								                        }
-												execute: log errors from "sd(EXEC)"

To give the administrator more hints about failures occurring in spawning
of commands than just the exit code, log the strerror.
All fds are closed, so reopen the log.

Related-to: https://bugzilla.redhat.com/show_bug.cgi?id=752901

											
										
										
											2011-11-17 00:21:16 +01:00
+								                }
-												execute: do initgroups() first, pam initialization second so that it can still modify the groups list

											
										
										
											2011-06-30 02:15:01 +02:00
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                /* This is done before enforce_user, but ambient set
 								                 * does not survive over setresuid() if keep_caps is not set. */
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                if (!needs_ambient_hack &&
 								                    context->capability_ambient_set != 0) {
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                        r = capability_ambient_set_apply(context->capability_ambient_set, true);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_CAPABILITIES;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                        }
 								                }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        }
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_setuid) {
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                if (context->user) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                        r = enforce_user(context, uid);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_USER;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                        }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       code itself.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
 								                        if (!needs_ambient_hack &&
 								                            context->capability_ambient_set != 0) {
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
 								                                /* Fix the ambient capabilities after user change. */
 								                                r = capability_ambient_set_apply(context->capability_ambient_set, false);
 								                                if (r < 0) {
 								                                        *exit_status = EXIT_CAPABILITIES;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                        return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                                }
 								                                /* If we were asked to change user and ambient capabilities
 								                                 * were requested, we had to add keep-caps to the securebits
 								                                 * so that we would maintain the inherited capability set
 								                                 * through the setresuid(). Make sure that the bit is added
 								                                 * also to the context secure_bits so that we don't try to
 								                                 * drop the bit away next. */
-												tree-wide: indentation fixes

											
										
										
											2016-02-25 00:27:56 +01:00
+								                                secure_bits |= 1<<SECURE_KEEP_CAPS;
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                        }
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
+								                }
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								        if (needs_sandboxing) {
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
 								                 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
 								                 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
 								                 * are restricted. */
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SELINUX
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                if (use_selinux) {
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                        char *exec_context = mac_selinux_context_net ?: context->selinux_context;
 								                        if (exec_context) {
 								                                r = setexeccon(exec_context);
 								                                if (r < 0) {
 								                                        *exit_status = EXIT_SELINUX_CONTEXT;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                        return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                                }
 								                        }
 								                }
 								#endif
-												build-sys: s/HAVE_SMACK/ENABLE_SMACK/

Same justification as for HAVE_UTMP.

											
										
										
											2017-10-03 12:22:40 +02:00
+								#if ENABLE_SMACK
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                if (use_smack) {
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								                        r = setup_smack(context, command);
 								                        if (r < 0) {
 								                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								                        }
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                }
-												core: check which MACs to use before a new mount ns is created (#6498)

/sys is not guaranteed to exist when a new mount namespace is created.
It is only mounted under conditions specified by
`namespace_info_mount_apivfs`.

Checking if the three available MAC LSMs are enabled requires a sysfs
mounted at /sys, so the checks are moved to before a new mount ns is
created.
											
										
										
											2017-08-01 09:15:18 +02:00
+								#endif
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_APPARMOR
-												execute: needs_{selinux,apparmor,smack} → use_{selinux,apparmor,smack}

These booleans simply store whether selinux/apparmor/smack are supposed
to be used, and cache the various mac_xyz_use() calls before we
transition into the namespace, hence let's use the same verb for the
variables and the functions: "use"

											
										
										
											2017-08-08 19:49:04 +02:00
+								                if (use_apparmor && context->apparmor_profile) {
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                        r = aa_change_onexec(context->apparmor_profile);
 								                        if (r < 0 && !context->apparmor_profile_ignore) {
 								                                *exit_status = EXIT_APPARMOR_PROFILE;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                        }
 								                }
 								#endif
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
 								                 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								                if (prctl(PR_GET_SECUREBITS) != secure_bits)
 								                        if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here make sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                                *exit_status = EXIT_SECUREBITS;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
-												exec: introduce PrivateNetwork= process option to turn off network access to specific services

											
										
										
											2011-08-02 05:24:58 +02:00
+								                        }
-												service: optionally call into PAM when dropping priviliges

											
										
										
											2010-06-16 21:54:17 +02:00
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                if (context_has_no_new_privileges(context))
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here make sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                                *exit_status = EXIT_NO_NEW_PRIVILEGES;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                                return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                        }
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelt, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_address_families(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_ADDRESS_FAMILIES;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
-												execute: log errors from "sd(EXEC)"

To give the administrator more hints about failures occurring in spawning
of commands than just the exit code, log the strerror.
All fds are closed, so reopen the log.

Related-to: https://bugzilla.redhat.com/show_bug.cgi?id=752901

											
										
										
											2011-11-17 00:21:16 +01:00
+								                }
-												execute: setup namespace after doing NSS calls

											
										
										
											2010-06-16 16:39:28 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_memory_deny_write_execute(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								                }
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_restrict_realtime(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								                }
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit, as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								                r = apply_restrict_namespaces(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit, as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								                }
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_protect_sysctl(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
-												core:sandbox: Add ProtectKernelModules= option

This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature which is a system wide operation.

											
										
										
											2016-10-12 13:31:21 +02:00
+								                }
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_protect_kernel_modules(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                }
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                r = apply_private_devices(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                }
 								                r = apply_syscall_archs(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
-												execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.

											
										
										
											2016-08-26 16:39:04 +02:00
+								                }
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and a source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
+								                r = apply_lock_personality(unit, context);
 								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and a source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
+								                }
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								                /* This really should remain the last step before the execve(), to make sure our own code is unaffected
 								                 * by the filter as little as possible. */
-												core: add two new special ExecStart= character prefixes

This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar options, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

											
										
										
											2017-08-09 16:09:04 +02:00
+								                r = apply_syscall_filter(unit, context, needs_ambient_hack);
-												seccomp: rework seccomp code, to improve compat with some archs

This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.

So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.

This rework hence changes a couple of things:

- We no longer use seccomp_rule_add(), but only
  seccomp_rule_add_exact(), and fail the installation of a filter if the
  architecture doesn't support it.

- We no longer rely on adding multiple syscall architectures to a single filter,
  but instead install a separate filter for each syscall architecture
  supported. This way, we can install a strict filter for x86-64, while
  permitting a less strict filter for i386.

- All high-level filter additions are now moved from execute.c to
  seccomp-util.c, so that we can test them independently of the service
  execution logic.

- Tests have been added for all types of our seccomp filters.

- SystemCallFilters= and SystemCallArchitectures= are now implemented in
  independent filters and installation logic, as they semantically are
  very much independent of each other.

Fixes: #4575

											
										
										
											2016-12-27 15:28:25 +01:00
+								                if (r < 0) {
 								                        *exit_status = EXIT_SECCOMP;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                }
 								#endif
 								        }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
+								        if (!strv_isempty(context->unset_environment)) {
 								                char **ee = NULL;
 								                ee = strv_env_delete(accum_env, 1, context->unset_environment);
 								                if (!ee) {
 								                        *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        return log_oom();
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
+								                }
 								                strv_free(accum_env);
 								                accum_env = ee;
 								        }
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        final_argv = replace_env_argv(argv, accum_env);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (!final_argv) {
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                *exit_status = EXIT_MEMORY;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                return log_oom();
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												tree-wide: remove unnecessary LOG_PRI

LOG_DEBUG is already a log level, there is no need to use LOG_PRI which
is for filtering out the facility.

											
										
										
											2015-01-06 06:29:40 +01:00
+								        if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                _cleanup_free_ char *line;
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                line = exec_command_line(final_argv);
 								                if (line) {
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        log_struct(LOG_DEBUG,
 								                                   "EXECUTABLE=%s", command->path,
 								                                   LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
-												tree-wide: mark log_struct with _printf_ and fix fallout

log_struct takes multiple format strings, each one followed by arguments.
The _printf_ annotation is not sufficiently flexible to express this,
but we can still annotate the first format string, though not its
arguments (because their number is unknown).

With the annotation, the places which specified the message id or similar
as the first pattern cause a warning from -Wformat-nonliteral. This can
be trivially fixed by putting the MESSAGE= first.

This change will help find issues where a non-literal is erroneously used
as the pattern.

											
										
										
											2017-04-20 20:15:28 +02:00
+								                                   LOG_UNIT_ID(unit),
-												core: make sure to log invocation ID of units also when doing structured logging

											
										
										
											2017-09-20 18:27:53 +02:00
+								                                   LOG_UNIT_INVOCATION_ID(unit),
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                                   NULL);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                }
 								        }
-												core: when we cannot add PID to a scope cgroup, log about it

Also, place the scope unit in failed state.

											
										
										
											2015-04-28 12:20:29 +02:00
-												core/execute: pass env vars to PAM session setup (#3503)

Move the merger of environment variables before setting up the PAM
session and pass the aggregate environment to PAM setup. This allows
control over the PAM session hooks through environment variables.

PAM session initiation may update the environment. On successful
initiation of a PAM session, we adopt the environment of the
PAM context.
											
										
										
											2016-06-13 12:50:12 +02:00
+								        execve(command->path, final_argv, accum_env);
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
 								        if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
 								                log_struct_errno(LOG_INFO, errno,
 								                                 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
 								                                 LOG_UNIT_ID(unit),
 								                                 LOG_UNIT_INVOCATION_ID(unit),
 								                                 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
 								                                                  command->path),
 								                                 "EXECUTABLE=%s", command->path,
 								                                 NULL);
 								                return 0;
 								        }
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        *exit_status = EXIT_EXEC;
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								        return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								}
-												execute: implement privilige dropping properly

											
										
										
											2010-02-14 22:43:08 +01:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								int exec_spawn(Unit *unit,
 								               ExecCommand *command,
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								               const ExecContext *context,
 								               const ExecParameters *params,
 								               ExecRuntime *runtime,
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								               DynamicCreds *dcreds,
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								               pid_t *ret) {
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        _cleanup_strv_free_ char **files_env = NULL;
-												core: only apply NonBlocking= to fds passed via socket activation

Make sure to only apply the O_NONBLOCK flag to the fds passed via socket
activation.

Previously the flag was also applied to the fds which came from the fd store
but this was incorrect since services, after being restarted, expect that these
passed fds have their flags unchanged and can be reused as before.

The documentation was a bit unclear about this so clarify it.

											
										
										
											2017-05-12 11:32:53 +02:00
+								        int *fds = NULL;
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								        unsigned n_storage_fds = 0, n_socket_fds = 0;
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        _cleanup_free_ char *line = NULL;
 								        int socket_fd, r;
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        int named_iofds[3] = { -1, -1, -1 };
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        char **argv;
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        pid_t pid;
-												execute: support syscall filtering using seccomp filters

											
										
										
											2012-07-17 04:17:53 +02:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        assert(unit);
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        assert(command);
 								        assert(context);
 								        assert(ret);
 								        assert(params);
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								        assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
-												core: add new RestrictAddressFamilies= switch

This new unit setting allows restricting which address families are
available to processes. This is an effective way to minimize the attack
surface of services, by turning off entire network stacks for them.

This is based on seccomp, and does not work on x86-32, since seccomp
cannot filter socketcall() syscalls on that platform.

											
										
										
											2014-02-25 20:37:03 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (context->std_input == EXEC_INPUT_SOCKET ||
 								            context->std_output == EXEC_OUTPUT_SOCKET ||
 								            context->std_error == EXEC_OUTPUT_SOCKET) {
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								                if (params->n_socket_fds > 1) {
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        log_unit_error(unit, "Got more than one socket.");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                        return -EINVAL;
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                }
-												core: Add AppArmor profile switching

This permits switching to a specific AppArmor profile when starting a daemon. This
will result in a no-op if AppArmor is disabled.
It also adds a new build requirement on libapparmor for using this feature.

											
										
										
											2014-02-20 16:19:44 +01:00
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								                if (params->n_socket_fds == 0) {
-												execute: Properly log errors considering socket fds (#5910)

Until now, if params->n_fds was 0, systemd logged that there was
more than one socket.

Thanks @gregoryp and @VFXcode who did the most work debugging this.
											
										
										
											2017-05-09 01:09:22 +02:00
+								                        log_unit_error(unit, "Got no socket.");
 								                        return -EINVAL;
 								                }
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								                socket_fd = params->fds[0];
 								        } else {
 								                socket_fd = -1;
 								                fds = params->fds;
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								                n_storage_fds = params->n_storage_fds;
-												core: only apply NonBlocking= to fds passed via socket activation

Make sure to only apply the O_NONBLOCK flag to the fds passed via socket
activation.

Previously the flag was also applied to the fds which came from the fd store
but this was incorrect since services, after being restarted, expect that these
passed fds have their flags unchanged and can be reused as before.

The documentation was a bit unclear about this so clarify it.

											
										
										
											2017-05-12 11:32:53 +02:00
+								                n_socket_fds = params->n_socket_fds;
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        r = exec_context_named_iofds(unit, context, params, named_iofds);
 								        if (r < 0)
 								                return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        r = exec_context_load_environment(unit, context, &files_env);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								        if (r < 0)
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        argv = params->argv ?: command->argv;
 								        line = exec_command_line(argv);
 								        if (!line)
 								                return log_oom();
-												execute: support minimal environment variable replacement when executing processes

											
										
										
											2010-07-08 04:09:59 +02:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        log_struct(LOG_DEBUG,
 								                   LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
 								                   "EXECUTABLE=%s", command->path,
-												tree-wide: mark log_struct with _printf_ and fix fallout

log_struct takes multiple format strings, each one followed by arguments.
The _printf_ annotation is not sufficiently flexible to express this,
but we can still annotate the first format string, though not its
arguments (because their number is unknown).

With the annotation, the places which specified the message id or similar
as the first pattern cause a warning from -Wformat-nonliteral. This can
be trivially fixed by putting the MESSAGE= first.

This change will help find issues where a non-literal is erroneously used
as the pattern.

											
										
										
											2017-04-20 20:15:28 +02:00
+								                   LOG_UNIT_ID(unit),
-												core: make sure to log invocation ID of units also when doing structured logging

											
										
										
											2017-09-20 18:27:53 +02:00
+								                   LOG_UNIT_INVOCATION_ID(unit),
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                   NULL);
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        pid = fork();
 								        if (pid < 0)
-												core:execute: fix fork() fail handling in exec_spawn()

    If pid < 0 after fork(), 0 is always returned because r =
    exec_context_load_environment() has exited successfully.

    This leaves the caller of exec_spawn() unable to handle the
    fork() error case, and may cause systemd to abort on an assert().

											
										
										
											2015-11-26 04:46:40 +01:00
+								                return log_unit_error_errno(unit, errno, "Failed to fork: %m");
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
 								        if (pid == 0) {
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                int exit_status = EXIT_SUCCESS;
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                r = exec_child(unit,
 								                               command,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                               context,
 								                               params,
 								                               runtime,
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								                               dcreds,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                               argv,
 								                               socket_fd,
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                               named_iofds,
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the fd store
mechanism. That way each fd is counted in one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								                               fds,
 								                               n_storage_fds,
-												core: only apply NonBlocking= to fds passed via socket activation

Make sure to only apply the O_NONBLOCK flag to the fds passed via socket
activation.

Previously the flag was also applied to the fds which came from the fd store
but this was incorrect since services, after being restarted, expect that these
passed fds have their flags unchanged and can be reused as before.

The documentation was a bit unclear about this so clarify it.

											
										
										
											2017-05-12 11:32:53 +02:00
+								                               n_socket_fds,
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                               files_env,
-												core: add RemoveIPC= setting

This adds the boolean RemoveIPC= setting to service, socket, mount and swap
units (i.e. all unit types that may invoke processes). If turned on, and the
unit's user/group is not root, all IPC objects of the user/group are removed
when the service is shut down. The life-cycle of the IPC objects is hence bound
to the unit life-cycle.

This is particularly relevant for units with dynamic users, as it is essential
that no objects owned by the dynamic users survive the service exiting. In
fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set.

In order to communicate the UID/GID of an executed process back to PID 1 this
adds a new "user lookup" socket pair, that is inherited into the forked
processes, and closed before the exec(). This is needed since we cannot do NSS
from PID 1 due to deadlock risks. However, we need to know the used UID/GID in
order to clean up IPC owned by it if the unit shuts down.

											
										
										
											2016-08-01 19:24:40 +02:00
+								                               unit->manager->user_lookup_fds[1],
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                               &exit_status);
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                if (r < 0) {
-												execute: normalize logging in execute.c

Now that logging can implicitly reopen the log streams when needed we
can log errors without any special magic, hence let's normalize things,
and log the same way we do everywhere else.

											
										
										
											2017-09-26 17:47:27 +02:00
+								                        log_struct_errno(LOG_ERR, r,
 								                                         "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
 								                                         LOG_UNIT_ID(unit),
 								                                         LOG_UNIT_INVOCATION_ID(unit),
 								                                         LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
 								                                                          exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
 								                                                          command->path),
 								                                         "EXECUTABLE=%s", command->path,
 								                                         NULL);
-												execute: log errors from "sd(EXEC)"

To give the administrator more hints about failures occurring in spawning
of commands than just the exit code, log the strerror.
All fds are closed, so reopen the log.

Related-to: https://bugzilla.redhat.com/show_bug.cgi?id=752901

											
										
										
											2011-11-17 00:21:16 +01:00
+								                }
-												core: modernize execution code a bit

Among other things, avoid log_struct() unless we really need it.

Also, use "r" as variable to store function errors in, instead of "err".
"r" is pretty much what we use everywhere else, hence using the same
here makes sense.

Finally, in the child, when we want to log, make sure to open the
logging framework first, since it is explicitly closed in preparation
for the exec().

											
										
										
											2015-01-09 00:13:33 +01:00
+								                _exit(exit_status);
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        }
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
-												systemd: use structured logging for unit changes

Information which unit a log entry pertains to enables systemctl
status to display more log messages.

											
										
										
											2012-10-11 00:11:24 +02:00
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        /* We add the new process to the cgroup both in the child (so
 								         * that we can be sure that no user code is ever executed
 								         * outside of the cgroup) and in the parent (so that we can be
 								         * sure that when we kill the cgroup the process will be
 								         * killed too). */
-												exec: move code executed after fork into exec_child()

This factors out one conditional branch that has grown way too big, and
makes the code more readable by using return statements rather than jump
labels.

											
										
										
											2014-08-23 16:02:21 +02:00
+								        if (params->cgroup_path)
-												core: when we cannot add PID to a scope cgroup, log about it

Also, place the scope unit in failed state.

											
										
										
											2015-04-28 12:20:29 +02:00
+								                (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
-												fix an assert when forking

											
										
										
											2010-01-27 05:30:58 +01:00
-												dbus: complete exec status coverage

											
										
										
											2010-07-04 18:49:58 +02:00
+								        exec_status_start(&command->exec_status, pid);
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        *ret = pid;
-												first attempt in implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								        return 0;
 								}
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								void exec_context_init(ExecContext *c) {
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        ExecDirectoryType i;
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        assert(c);
-												umask: change default umask to 0022 just to be sure, and set it explicitly in all binaries, in order to make sure it is set when started from the terminal

											
										
										
											2011-08-01 20:52:18 +02:00
+								        c->umask = 0022;
-												support chrooting/setting of ioprio when spawning

											
										
										
											2010-01-29 20:46:22 +01:00
+								        c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        c->cpu_sched_policy = SCHED_OTHER;
-												implement proper logging for services

											
										
										
											2010-01-28 02:06:20 +01:00
+								        c->syslog_priority = LOG_DAEMON|LOG_INFO;
-												turn negative options into positive options

											
										
										
											2010-07-05 01:08:13 +02:00
+								        c->syslog_level_prefix = true;
-												service: ignore SIGPIPE by default

											
										
										
											2012-02-09 03:18:04 +01:00
+								        c->ignore_sigpipe = true;
-												time-util: add and use USEC/NSEC_INFINITY

											
										
										
											2014-07-29 12:23:31 +02:00
+								        c->timer_slack_nsec = NSEC_INFINITY;
-												util: introduce PERSONALITY_INVALID as macro for 0xffffffffLU

											
										
										
											2015-05-21 19:48:49 +02:00
+								        c->personality = PERSONALITY_INVALID;
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                c->directories[i].mode = 0755;
-												capabilities: keep bounding set in non-inverted format.

Change the capability bounding set parser and logic so that the bounding
set is kept as a positive set internally. This means that the set
reflects those capabilities that we want to keep instead of drop.

											
										
										
											2016-01-07 23:00:04 +01:00
+								        c->capability_bounding_set = CAP_ALL;
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								        c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								}
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								void exec_context_done(ExecContext *c) {
-												first attempt in implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								        unsigned l;
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        ExecDirectoryType i;
-												first attempt in implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
 								        assert(c);
-												tree-wide: make use of the fact that strv_free() returns NULL

Another Coccinelle patch.

											
										
										
											2015-09-09 23:05:10 +02:00
+								        c->environment = strv_free(c->environment);
 								        c->environment_files = strv_free(c->environment_files);
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--set-env=...` options to systemd-spawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								        c->pass_environment = strv_free(c->pass_environment);
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
+								        c->unset_environment = strv_free(c->unset_environment);
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												tree-wide: update empty-if coccinelle script to cover empty-while and more

Let's also clean up single-line while and for blocks.

											
										
										
											2015-09-09 14:23:02 +02:00
+								        for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								                c->rlimit[l] = mfree(c->rlimit[l]);
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        for (l = 0; l < 3; l++)
 								                c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								        c->working_directory = mfree(c->working_directory);
 								        c->root_directory = mfree(c->root_directory);
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        c->root_image = mfree(c->root_image);
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								        c->tty_path = mfree(c->tty_path);
 								        c->syslog_identifier = mfree(c->syslog_identifier);
 								        c->user = mfree(c->user);
 								        c->group = mfree(c->group);
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
-												tree-wide: make use of the fact that strv_free() returns NULL

Another Coccinelle patch.

											
										
										
											2015-09-09 23:05:10 +02:00
+								        c->supplementary_groups = strv_free(c->supplementary_groups);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								        c->pam_name = mfree(c->pam_name);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												doc,core: Read{Write,Only}Paths= and InaccessiblePaths=

This patch renames Read{Write,Only}Directories= and InaccessibleDirectories=
to Read{Write,Only}Paths= and InaccessiblePaths=, previous names are kept
as aliases but they are not advertised in the documentation.

Renamed variables:
`read_write_dirs` --> `read_write_paths`
`read_only_dirs` --> `read_only_paths`
`inaccessible_dirs` --> `inaccessible_paths`

											
										
										
											2016-07-07 11:17:00 +02:00
+								        c->read_only_paths = strv_free(c->read_only_paths);
 								        c->read_write_paths = strv_free(c->read_write_paths);
 								        c->inaccessible_paths = strv_free(c->inaccessible_paths);
-												dbus: complete exec coverage

											
										
										
											2010-07-04 16:44:58 +02:00
-												core: add ability to define arbitrary bind mounts for services

This adds two new settings BindPaths= and BindReadOnlyPaths=. They allow
defining arbitrary bind mounts specific to particular services. This is
particularly useful for services with RootDirectory= set as this permits making
specific bits of the host directory available to chrooted services.

The two new settings follow the concepts nspawn already possesses in --bind= and
--bind-ro=, as well as the .nspawn settings Bind= and BindReadOnly= (and these
latter options should probably be renamed to BindPaths= and BindReadOnlyPaths=
too).

Fixes: #3439

											
										
										
											2016-11-23 22:21:40 +01:00
+								        bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
-												dbus: complete exec coverage

											
										
										
											2010-07-04 16:44:58 +02:00
+								        if (c->cpuset)
 								                CPU_FREE(c->cpuset);
-												execute,util: fix two small memory leaks

											
										
										
											2011-01-06 23:52:17 +01:00
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								        c->utmp_id = mfree(c->utmp_id);
 								        c->selinux_context = mfree(c->selinux_context);
 								        c->apparmor_profile = mfree(c->apparmor_profile);
-												core: modify resource leak by SmackProcessLabel=

											
										
										
											2017-07-13 06:06:34 +02:00
+								        c->smack_process_label = mfree(c->smack_process_label);
-												core: Add AppArmor profile switching

This permits switching to a specific AppArmor profile when starting a daemon. This
will be a no-op if AppArmor is disabled.
It also adds a new build requirement on libapparmor for using this feature.

											
										
										
											2014-02-20 16:19:44 +01:00
-												tree-wide: take benefit of the fact that hashmap_free() returns NULL

And set_free() too.

Another Coccinelle patch.

											
										
										
											2015-09-09 23:12:07 +02:00
+								        c->syscall_filter = set_free(c->syscall_filter);
 								        c->syscall_archs = set_free(c->syscall_archs);
 								        c->address_families = set_free(c->address_families);
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                c->directories[i].paths = strv_free(c->directories[i].paths);
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								}
 								int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
 								        char **i;
 								        assert(c);
 								        if (!runtime_prefix)
 								                return 0;
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								                _cleanup_free_ char *p;
-												tree-wide: drop NULL sentinel from strjoin

This makes strjoin and strjoina more similar and avoids the useless final
argument.

spatch -I . -I ./src -I ./src/basic -I ./src/basic -I ./src/shared -I ./src/shared -I ./src/network -I ./src/locale -I ./src/login -I ./src/journal -I ./src/journal -I ./src/timedate -I ./src/timesync -I ./src/nspawn -I ./src/resolve -I ./src/resolve -I ./src/systemd -I ./src/core -I ./src/core -I ./src/libudev -I ./src/udev -I ./src/udev/net -I ./src/udev -I ./src/libsystemd/sd-bus -I ./src/libsystemd/sd-event -I ./src/libsystemd/sd-login -I ./src/libsystemd/sd-netlink -I ./src/libsystemd/sd-network -I ./src/libsystemd/sd-hwdb -I ./src/libsystemd/sd-device -I ./src/libsystemd/sd-id128 -I ./src/libsystemd-network --sp-file coccinelle/strjoin.cocci --in-place $(git ls-files src/*.c)

git grep -e '\bstrjoin\b.*NULL' -l|xargs sed -i -r 's/strjoin\((.*), NULL\)/strjoin(\1)/'

This might have missed a few cases (spatch has a really hard time dealing
with _cleanup_ macros), but that's no big issue, they can always be fixed
later.

											
										
										
											2016-10-23 17:43:27 +02:00
+								                p = strjoin(runtime_prefix, "/", *i);
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								                if (!p)
 								                        return -ENOMEM;
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
+								                /* We execute this synchronously, since we need to be sure this is gone when we start the service
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								                 * next. */
-												util: rework rm_rf() logic

- Move to its own file rm-rf.c

- Change parameters into a single flags parameter

- Remove "honour sticky" logic, it's unused these days

											
										
										
											2015-04-04 11:52:57 +02:00
+								                (void) rm_rf(p, REMOVE_ROOT);
-												execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage=

Let's clean up the interaction of StateDirectory= (and friends) to
DynamicUser=1: instead of creating these directories directly below
/var/lib, place them in /var/lib/private instead if DynamicUser=1 is
set, making that directory 0700 and owned by root:root. This way, if a
dynamic UID is later reused, access to the old run's state directory is
prohibited for that user. Then, use file system namespacing inside the
service to make /var/lib/private a readable tmpfs, hiding all state
directories that are not listed in StateDirectory=, and making access to
the actual state directory possible. Mount all directories listed in
StateDirectory= to the same places inside the service (which means
they'll now be mounted into the tmpfs instance). Finally, add a symlink
from the state directory name in /var/lib/ to the one in
/var/lib/private, so that both the host and the service can access the
path under the same location.

Here's an example: let's say a service runs with StateDirectory=foo.
When DynamicUser=0 is set, it will get the following setup, and no
difference between what the unit and what the host sees:

        /var/lib/foo (created as directory)

Now, if DynamicUser=1 is set, we'll instead get this on the host:

        /var/lib/private (created as directory with mode 0700, root:root)
        /var/lib/private/foo (created as directory)
        /var/lib/foo → private/foo (created as symlink)

And from inside the unit:

        /var/lib/private (a tmpfs mount with mode 0755, root:root)
        /var/lib/private/foo (bind mounted from the host)
        /var/lib/foo → private/foo (the same symlink as above)

This takes inspiration from how container trees are protected below
/var/lib/machines: they generally reuse UIDs/GIDs of the host, but
because /var/lib/machines itself is set to 0700 host users cannot access
files in the container tree even if the UIDs/GIDs are reused. However,
for this commit we add one further trick: inside and outside of the unit
/var/lib/private is a different thing: outside it is a plain,
inaccessible directory, and inside it is a world-readable tmpfs mount
with only the whitelisted subdirs below it, bind mounted in.  This
means, from the outside the dir acts as an access barrier, but from the
inside it does not. And the symlink created in /var/lib/foo itself
points across the barrier in both cases, so that root and the unit's
user always have access to these dirs without knowing the details of
this mounting magic.

This logic resolves a major shortcoming of DynamicUser=1 units:
previously they couldn't safely store persistent data. With this change
they can have their own private state, log and data directories, which
they can write to, but which are protected from UID recycling.

With this change, if RootDirectory= or RootImage= are used it is ensured
that the specified state/log/cache directories are always mounted in
from the host. This change of semantics I think is much preferable since
this means the root directory/image logic can be used easily for
read-only resource bundling (as all writable data resides outside of the
image). Note that this is a change of behaviour, but given that we
haven't released any systemd version with StateDirectory= and friends
implemented this should be a safe change to make (in particular as
previously it wasn't clear what would actually happen when used in
combination). Moreover, by making this change we can later add a "+"
modifier to these settings too, working similarly to the same modifier in
ReadOnlyPaths= and friends, making specified paths relative to the
container itself.

											
										
										
											2017-09-28 18:55:45 +02:00
 								                /* Also destroy any matching subdirectory below /private/. This is done to support DynamicUser=1
 								                 * setups. Note that we don't conditionalize here on that though, as the namespace is same way, and it
 								                 * makes us a bit more robust towards changing unit settings. Or to say this differently: in the worst
 								                 * case this is a NOP. */
 								                free(p);
 								                p = strjoin(runtime_prefix, "/private/", *i);
 								                if (!p)
 								                        return -ENOMEM;
 								                (void) rm_rf(p, REMOVE_ROOT);
-												core: introduce new RuntimeDirectory= and RuntimeDirectoryMode= unit settings

As discussed on the ML these are useful to manage runtime directories
below /run for services.

											
										
										
											2014-03-03 17:14:07 +01:00
+								        }
 								        return 0;
-												first attempt at implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								}
-												execute: introduce exec_command_done() to free data from static ExecCommand structs

											
										
										
											2010-04-10 17:47:07 +02:00
+								void exec_command_done(ExecCommand *c) {
 								        assert(c);
-												tree-wide: use coccinelle to patch a lot of code to use mfree()

This replaces this:

        free(p);
        p = NULL;

by this:

        p = mfree(p);

Change generated using coccinelle. Semantic patch is added to the
sources.

											
										
										
											2015-09-08 18:43:11 +02:00
+								        c->path = mfree(c->path);
-												execute: introduce exec_command_done() to free data from static ExecCommand structs

											
										
										
											2010-04-10 17:47:07 +02:00
-												tree-wide: make use of the fact that strv_free() returns NULL

Another Coccinelle patch.

											
										
										
											2015-09-09 23:05:10 +02:00
+								        c->argv = strv_free(c->argv);
-												execute: introduce exec_command_done() to free data from static ExecCommand structs

											
										
										
											2010-04-10 17:47:07 +02:00
+								}
 								void exec_command_done_array(ExecCommand *c, unsigned n) {
 								        /* Calls exec_command_done() on each of the n ExecCommand structures stored
 								         * contiguously starting at c, in ascending index order. The array itself is
 								         * not freed here. */
 								        unsigned idx = 0;
 								        while (idx < n) {
 								                exec_command_done(&c[idx]);
 								                idx++;
 								        }
 								}
-												core: make exec_command_free_list return NULL

											
										
										
											2014-12-18 18:29:24 +01:00
+								ExecCommand* exec_command_free_list(ExecCommand *c) {
-												first attempt at implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								        ExecCommand *i;
 								        while ((i = c)) {
-												list: make our list macros a bit easier to use by not requiring type spec on each invocation

We can determine the list entry type via the typeof() gcc construct, and
so we should, to make the macros much shorter to use.

											
										
										
											2013-10-14 06:10:14 +02:00
+								                LIST_REMOVE(command, c, i);
-												execute: introduce exec_command_done() to free data from static ExecCommand structs

											
										
										
											2010-04-10 17:47:07 +02:00
+								                exec_command_done(i);
-												first attempt at implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								                free(i);
 								        }
-												core: make exec_command_free_list return NULL

											
										
										
											2014-12-18 18:29:24 +01:00
 								        return NULL;
-												first attempt at implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								}
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								void exec_command_free_array(ExecCommand **c, unsigned n) {
 								        unsigned i;
-												core: make exec_command_free_list return NULL

											
										
										
											2014-12-18 18:29:24 +01:00
+								        for (i = 0; i < n; i++)
 								                c[i] = exec_command_free_list(c[i]);
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								}
-												env-util: don't include files from src/core/

											
										
										
											2014-12-23 19:04:56 +01:00
+								typedef struct InvalidEnvInfo {
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        Unit *unit;
-												env-util: don't include files from src/core/

											
										
										
											2014-12-23 19:04:56 +01:00
+								        const char *path;
 								} InvalidEnvInfo;
 								static void invalid_env(const char *p, void *userdata) {
 								        InvalidEnvInfo *info = userdata;
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
-												env-util: don't include files from src/core/

											
										
										
											2014-12-23 19:04:56 +01:00
+								}
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								const char* exec_context_fdname(const ExecContext *c, int fd_index) {
 								        const char *configured;
 								        assert(c);
 								        /* Returns the descriptor name to look up for the given standard stream index, or
 								         * NULL if that stream is not set to the named-fd mode (or fd_index is not one of
 								         * the three standard streams). When no explicit name is stored in
 								         * c->stdio_fdname[], the stream's own name ("stdin", "stdout", "stderr") is used. */
 								        if (fd_index == STDIN_FILENO) {
 								                if (c->std_input != EXEC_INPUT_NAMED_FD)
 								                        return NULL;
 								                configured = c->stdio_fdname[STDIN_FILENO];
 								                return configured ? configured : "stdin";
 								        }
 								        if (fd_index == STDOUT_FILENO) {
 								                if (c->std_output != EXEC_OUTPUT_NAMED_FD)
 								                        return NULL;
 								                configured = c->stdio_fdname[STDOUT_FILENO];
 								                return configured ? configured : "stdout";
 								        }
 								        if (fd_index == STDERR_FILENO) {
 								                if (c->std_error != EXEC_OUTPUT_NAMED_FD)
 								                        return NULL;
 								                configured = c->stdio_fdname[STDERR_FILENO];
 								                return configured ? configured : "stderr";
 								        }
 								        return NULL;
 								}
 								int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
 								        /* Looks up the file descriptors for the standard streams that are configured as named fds
 								         * (EXEC_INPUT_NAMED_FD for stdin, EXEC_OUTPUT_NAMED_FD for stdout and stderr).
 								         *
 								         * The first p->n_storage_fds + p->n_socket_fds entries of p->fd_names[] are compared against the
 								         * name returned by exec_context_fdname() for each stream. On a match the corresponding entry of
 								         * p->fds[] is stored in named_iofds[]. Only slots that are negative on entry are filled, so the
 								         * caller has to pre-initialize the array with negative values. Because of the if/else-if chain a
 								         * single entry of p->fds[] is assigned to at most one stream. The scan stops early once all
 								         * requested streams have been resolved.
 								         *
 								         * Returns 0 if every requested stream was matched and -ENOENT otherwise. The unit parameter is
 								         * not referenced by this function. */
 								        unsigned i, targets;
-												core/execute: reformat exec_context_named_iofds() for legibility

											
										
										
											2017-01-31 17:23:10 +01:00
+								        const char* stdio_fdname[3];
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								        unsigned n_fds;
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
 								        assert(c);
 								        assert(p);
 								        targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
 								                  (c->std_output == EXEC_OUTPUT_NAMED_FD) +
 								                  (c->std_error == EXEC_OUTPUT_NAMED_FD);
 								        for (i = 0; i < 3; i++)
 								                stdio_fdname[i] = exec_context_fdname(c, i);
-												core: remove the redundancy of 'n_fds' and 'n_storage_fds' in ExecParameters struct

'n_fds' field in the ExecParameters structure was counting the total number of
file descriptors to be passed to a unit.

This counter also includes the number of passed socket fds which is counted by
'n_socket_fds' already.

This patch removes that redundancy by replacing 'n_fds' with
'n_storage_fds'. The new field only counts the fds passed via the storage store
mechanism.  That way each fd is counted at one place only.

Subsequently the patch makes sure to fix code that used 'n_fds' and also wanted
to iterate through all of them by explicitly adding 'n_socket_fds' + 'n_storage_fds'.

Suggested by Lennart.

											
										
										
											2017-06-08 15:41:26 +02:00
+								        n_fds = p->n_storage_fds + p->n_socket_fds;
 								        for (i = 0; i < n_fds  && targets > 0; i++)
-												core/execute: reformat exec_context_named_iofds() for legibility

											
										
										
											2017-01-31 17:23:10 +01:00
+								                if (named_iofds[STDIN_FILENO] < 0 &&
 								                    c->std_input == EXEC_INPUT_NAMED_FD &&
 								                    stdio_fdname[STDIN_FILENO] &&
 								                    streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                        named_iofds[STDIN_FILENO] = p->fds[i];
 								                        targets--;
-												core/execute: reformat exec_context_named_iofds() for legibility

											
										
										
											2017-01-31 17:23:10 +01:00
 								                } else if (named_iofds[STDOUT_FILENO] < 0 &&
 								                           c->std_output == EXEC_OUTPUT_NAMED_FD &&
 								                           stdio_fdname[STDOUT_FILENO] &&
 								                           streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                        named_iofds[STDOUT_FILENO] = p->fds[i];
 								                        targets--;
-												core/execute: reformat exec_context_named_iofds() for legibility

											
										
										
											2017-01-31 17:23:10 +01:00
 								                } else if (named_iofds[STDERR_FILENO] < 0 &&
 								                           c->std_error == EXEC_OUTPUT_NAMED_FD &&
 								                           stdio_fdname[STDERR_FILENO] &&
 								                           streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								                        named_iofds[STDERR_FILENO] = p->fds[i];
 								                        targets--;
 								                }
-												core/execute: reformat exec_context_named_iofds() for legibility

											
										
										
											2017-01-31 17:23:10 +01:00
+								        return targets == 0 ? 0 : -ENOENT;
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								}
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlinks exist for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
 								        /* Reads all files listed in c->environment_files and stores the combined list in *l.
 								         *
 								         * Each entry may start with a dash. In that case a failure for this entry (path not absolute,
 								         * safe_glob() failure, load_env_file() failure for a matched file) is skipped instead of
 								         * aborting the call. Every entry is expanded with safe_glob(). Each matched file is loaded with
 								         * load_env_file() and passed through strv_env_clean_with_callback(), with invalid_env as the
 								         * callback, receiving the unit and the file path. The result is then merged into the list
 								         * collected so far via strv_env_merge().
 								         *
 								         * Returns 0 on success, -EINVAL for a non-absolute path without dash prefix, the negative error
 								         * of safe_glob() or load_env_file(), or -ENOMEM if merging fails. On every error path the list
 								         * collected so far is freed and *l is not written. */
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								        char **i, **r = NULL;
 								        assert(c);
 								        assert(l);
 								        STRV_FOREACH(i, c->environment_files) {
 								                char *fn;
-												core: fix warning about unsigned variable (#5935)

Fixup for d8c92e8bc7351f553936b5235e1922c18ebd817a.
											
										
										
											2017-05-11 08:15:28 +02:00
+								                int k;
 								                unsigned n;
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								                bool ignore = false;
 								                char **p;
-												move _cleanup_ attribute in front of the type

http://lists.freedesktop.org/archives/systemd-devel/2013-April/010510.html

											
										
										
											2013-04-18 09:11:22 +02:00
+								                _cleanup_globfree_ glob_t pglob = {};
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
 								                fn = *i;
 								                if (fn[0] == '-') {
 								                        ignore = true;
-												tree-wide: make ++/-- usage consistent WRT spacing

Throughout the tree there's spurious use of spaces separating ++ and --
operators from their respective operands.  Make ++ and -- operator
consistent with the majority of existing uses; discard the spaces.

											
										
										
											2016-02-23 05:32:04 +01:00
+								                        fn++;
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								                }
 								                if (!path_is_absolute(fn)) {
 								                        if (ignore)
 								                                continue;
 								                        strv_free(r);
 								                        return -EINVAL;
 								                }
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                /* Filename supports globbing, take all matching files */
-												execute: filter out "." for ".." in EnvironmentFile= globs too

This doesn't really matter much, only in case somebody would use
something strange like

  EnvironmentFile=/etc/something/.*

Make sure that "." and ".." is not returned by that glob. This makes
all our globbing patterns behave the same.

											
										
										
											2017-04-26 04:54:50 +02:00
+								                k = safe_glob(fn, 0, &pglob);
 								                if (k < 0) {
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                        if (ignore)
 								                                continue;
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                        strv_free(r);
-												execute: filter out "." for ".." in EnvironmentFile= globs too

This doesn't really matter much, only in case somebody would use
something strange like

  EnvironmentFile=/etc/something/.*

Make sure that "." and ".." is not returned by that glob. This makes
all our globbing patterns behave the same.

											
										
										
											2017-04-26 04:54:50 +02:00
+								                        return k;
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                }
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												execute: filter out "." for ".." in EnvironmentFile= globs too

This doesn't really matter much, only in case somebody would use
something strange like

  EnvironmentFile=/etc/something/.*

Make sure that "." and ".." is not returned by that glob. This makes
all our globbing patterns behave the same.

											
										
										
											2017-04-26 04:54:50 +02:00
+								                /* When we don't match anything, -ENOENT should be returned */
 								                assert(pglob.gl_pathc > 0);
 								                for (n = 0; n < pglob.gl_pathc; n++) {
-												machinectl: show /etc/os-release information of container in status output

											
										
										
											2014-07-03 17:50:55 +02:00
+								                        k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                        if (k < 0) {
 								                                if (ignore)
 								                                        continue;
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                                strv_free(r);
 								                                return k;
-												replace tabs with spaces in various files

The affected files in this patch had inconsistent use of tabs vs. spaces
for indentation, and this patch eliminates the stray tabs.

Also, the opening brace of sigchld_hdl() in activate.c was moved so the
opening braces are consistent throughout the file.

											
										
										
											2013-07-02 13:24:48 +02:00
+								                        }
-												core/execute: report invalid environment variables from files

Because "export key=val" is not supported by systemd, an error is logged
where the invalid assignment is coming from.

Introduce strv_env_clean_log() to log invalid environment assignments,
where logging is possible and allowed.

parse_env_file_internal() is modified to allow WHITESPACE in keys, to
report the issues later on.

											
										
										
											2013-04-17 15:25:02 +02:00
+								                        /* Log invalid environment variables with filename */
-												env-util: don't include files from src/core/

											
										
										
											2014-12-23 19:04:56 +01:00
+								                        if (p) {
 								                                InvalidEnvInfo info = {
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlinks exist for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                                        .unit = unit,
-												env-util: don't include files from src/core/

											
										
										
											2014-12-23 19:04:56 +01:00
+								                                        .path = pglob.gl_pathv[n]
 								                                };
 								                                p = strv_env_clean_with_callback(p, invalid_env, &info);
 								                        }
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                        if (r == NULL)
 								                                r = p;
 								                        else {
 								                                char **m;
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                                m = strv_env_merge(2, r, p);
 								                                strv_free(r);
 								                                strv_free(p);
-												Add _cleanup_globfree_

Fixes a memleak in error path in exec_context_load_environment.

											
										
										
											2013-03-25 00:09:19 +01:00
+								                                if (!m)
-												Added globbing support to EnvironmentFile

This patch allows globbing to be used with EnvironmentFile option.
Example:
EnvironmentFile=/etc/foo.d/*.conf

t. Pekka

											
										
										
											2013-01-02 12:41:52 +01:00
+								                                        return -ENOMEM;
 								                                r = m;
 								                        }
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								                }
 								        }
 								        *l = r;
 								        return 0;
 								}
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
+								static bool tty_may_match_dev_console(const char *tty) {
 								        /* Returns true if the TTY path passed in may refer to the same device as /dev/console. The check
 								         * errs on the side of true: a NULL tty and a console that resolve_dev_console() cannot resolve
 								         * both count as a possible match. Otherwise the name left after skip_dev_prefix() is compared
 								         * with the literal name console and with the resolved console name. A resolved console of tty0
 								         * matches every tty for which tty_is_vc() returns true. */
-												use more _cleanup_ macro

											
										
										
											2014-06-24 19:00:32 +02:00
+								        _cleanup_free_ char *active = NULL;
-												tree-wide: fix indentation

											
										
										
											2015-08-06 00:31:09 +02:00
+								        char *console;
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								        if (!tty)
 								                return true;
-												util-lib: add a new skip_dev_prefix() helper

This new helper removes a leading /dev if there is one. We have code
doing this all over the place, let's unify this, and correct it while
we are at it, by using path_startswith() rather than startswith() to
drop the prefix.

											
										
										
											2017-08-09 19:01:18 +02:00
+								        tty = skip_dev_prefix(tty);
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
 								        /* trivial identity? */
 								        if (streq(tty, "console"))
 								                return true;
 								        console = resolve_dev_console(&active);
 								        /* if we could not resolve, assume it may */
 								        if (!console)
 								                return true;
 								        /* "tty0" means the active VC, so it may be the same sometimes */
-												use more _cleanup_ macro

											
										
										
											2014-06-24 19:00:32 +02:00
+								        return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
+								}
 								bool exec_context_may_touch_console(ExecContext *ec) {
 								        /* Returns true if executing with this context may affect /dev/console. This requires both of the
 								         * following: at least one of tty_reset, tty_vhangup, tty_vt_disallocate is set or one of the
 								         * three standard streams is a terminal according to is_terminal_input()/is_terminal_output(),
 								         * and the path returned by exec_context_tty_path() may match the console according to
 								         * tty_may_match_dev_console(). */
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
 								        return (ec->tty_reset ||
 								                ec->tty_vhangup ||
 								                ec->tty_vt_disallocate ||
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
+								                is_terminal_input(ec->std_input) ||
 								                is_terminal_output(ec->std_output) ||
 								                is_terminal_output(ec->std_error)) &&
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								               tty_may_match_dev_console(exec_context_tty_path(ec));
-												core/execute: determine if ExecContext may fiddle with /dev/console

There is some guesswork, but it should work satisfactorily for the
purpose of knowing when to suppress printing of status messages.

											
										
										
											2013-02-28 01:36:55 +01:00
+								}
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								static void strv_fprintf(FILE *f, char **l) {
 								        /* Writes every string of the string vector l to the stream f, each one preceded by a single
 								         * space character. No trailing newline is written. */
 								        char **entry;
 								        assert(f);
 								        STRV_FOREACH(entry, l) {
 								                fprintf(f, " %s", *entry);
 								        }
 								}
-												first attempt in implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
-												core: dump RuntimeDirectories and RuntimeDirectoryMode too

											
										
										
											2015-10-15 21:15:11 +02:00
+								        char **e, **d;
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        unsigned i;
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory= those are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        ExecDirectoryType dt;
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								        int r;
-												support chrooting/setting of ioprio when spawning

											
										
										
											2010-01-29 20:46:22 +01:00
-												first attempt in implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								        assert(c);
 								        assert(f);
-												core: general cgroup rework

Replace the very generic cgroup hookup with a much simpler one. With
this change only the high-level cgroup settings remain, the ability to
set arbitrary cgroup attributes is removed, so is support for adding
units to arbitrary cgroup controllers or setting arbitrary paths for
them (especially paths that are different for the various controllers).

This also introduces a new -.slice root slice, that is the parent of
system.slice and friends. This enables easy admin configuration of
root-level cgroup properties.

This replaces DeviceDeny= by DevicePolicy=, and implicitly adds in
/dev/null, /dev/zero and friends if DeviceAllow= is used (unless this is
turned off by DevicePolicy=).

											
										
										
											2013-06-27 04:14:27 +02:00
+								        prefix = strempty(prefix);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
 								        fprintf(f,
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                "%sUMask: %04o\n"
 								                "%sWorkingDirectory: %s\n"
-												execute: allow configuration of O_NONBLOCK flag from .service files

											
										
										
											2010-02-12 02:00:18 +01:00
+								                "%sRootDirectory: %s\n"
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								                "%sNonBlocking: %s\n"
-												exec: add ControlGroupModify= switch to allow changing access mode to cgroups fs

											
										
										
											2011-06-30 00:11:25 +02:00
+								                "%sPrivateTmp: %s\n"
-												exec: introduce PrivateDevices= switch to provide services with a private /dev

Similar to PrivateNetwork=, PrivateTmp= introduce PrivateDevices= that
sets up a private /dev with only the API pseudo-devices like /dev/null,
/dev/zero, /dev/random, but not any physical devices in them.

											
										
										
											2014-01-20 19:54:51 +01:00
+								                "%sPrivateDevices: %s\n"
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                "%sProtectKernelTunables: %s\n"
-												core: make sure to dump ProtectKernelModules= value

											
										
										
											2016-10-09 12:31:51 +02:00
+								                "%sProtectKernelModules: %s\n"
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                "%sProtectControlGroups: %s\n"
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								                "%sPrivateNetwork: %s\n"
 								                "%sPrivateUsers: %s\n"
-												core: rename ReadOnlySystem= to ProtectSystem= and add a third value for also mounting /etc read-only

Also, rename ProtectedHome= to ProtectHome=, to simplify things a bit.

With this in place we now have two neat options ProtectSystem= and
ProtectHome= for protecting the OS itself (and optionally its
configuration), and for protecting the user's data.

											
										
										
											2014-06-04 18:07:55 +02:00
+								                "%sProtectHome: %s\n"
 								                "%sProtectSystem: %s\n"
-												core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjunction with RootDirectory=

This adds a boolean unit file setting MountAPIVFS=. If set, the three
main API VFS mounts will be mounted for the service. This only has an
effect on RootDirectory=, which it makes a ton more useful.

(This is basically the /dev + /proc + /sys mounting code posted in the
original #4727, but rebased on current git, and with the automatic logic
replaced by explicit logic controlled by a unit file setting)

											
										
										
											2016-12-22 23:34:35 +01:00
+								                "%sMountAPIVFS: %s\n"
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								                "%sIgnoreSIGPIPE: %s\n"
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								                "%sMemoryDenyWriteExecute: %s\n"
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								                "%sRestrictRealtime: %s\n"
 								                "%sKeyringMode: %s\n",
-												first attempt at implementing execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								                prefix, c->umask,
-												support chrooting/setting of ioprio when spawning

											
										
										
											2010-01-29 20:46:22 +01:00
+								                prefix, c->working_directory ? c->working_directory : "/",
-												execute: allow configuration of O_NONBLOCK flag from .service files

											
										
										
											2010-02-12 02:00:18 +01:00
+								                prefix, c->root_directory ? c->root_directory : "/",
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								                prefix, yes_no(c->non_blocking),
-												exec: add ControlGroupModify= switch to allow changing access mode to cgroups fs

											
										
										
											2011-06-30 00:11:25 +02:00
+								                prefix, yes_no(c->private_tmp),
-												exec: introduce PrivateDevices= switch to provide services with a private /dev

Similar to PrivateNetwork= and PrivateTmp=, introduce PrivateDevices=, which
sets up a private /dev with only the API pseudo-devices like /dev/null,
/dev/zero, /dev/random, but not any physical devices in them.

											
										
										
											2014-01-20 19:54:51 +01:00
+								                prefix, yes_no(c->private_devices),
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                prefix, yes_no(c->protect_kernel_tunables),
-												core: make sure to dump ProtectKernelModules= value

											
										
										
											2016-10-09 12:31:51 +02:00
+								                prefix, yes_no(c->protect_kernel_modules),
-												core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/proc/sys/fs/cgroup.

											
										
										
											2016-08-22 18:43:59 +02:00
+								                prefix, yes_no(c->protect_control_groups),
-												core: add new PrivateUsers= option to service execution

This setting adds minimal user namespacing support to a service. When set the invoked
processes will run in their own user namespace. Only a trivial mapping will be
set up: the root user/group is mapped to root, and the user/group of the
service will be mapped to itself, everything else is mapped to nobody.

If this setting is used the service runs with no capabilities on the host, but
configurable capabilities within the service.

This setting is particularly useful in conjunction with RootDirectory= as the
need to synchronize /etc/passwd and /etc/group between the host and the service
OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the
user of the service itself. But even outside the RootDirectory= case this
setting is useful to substantially reduce the attack surface of a service.

Example command to test this:

        systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh

This runs a shell as user "foobar". When typing "ps" only processes owned by
"root", by "foobar", and by "nobody" should be visible.

											
										
										
											2016-08-03 18:44:51 +02:00
+								                prefix, yes_no(c->private_network),
 								                prefix, yes_no(c->private_users),
-												core: rename ReadOnlySystem= to ProtectSystem= and add a third value for also mounting /etc read-only

Also, rename ProtectedHome= to ProtectHome=, to simplify things a bit.

With this in place we now have two neat options ProtectSystem= and
ProtectHome= for protecting the OS itself (and optionally its
configuration), and for protecting the user's data.

											
										
										
											2014-06-04 18:07:55 +02:00
+								                prefix, protect_home_to_string(c->protect_home),
 								                prefix, protect_system_to_string(c->protect_system),
-												core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjunction with RootDirectory=

This adds a boolean unit file setting MountAPIVFS=. If set, the three
main API VFS mounts will be mounted for the service. This only has an
effect on RootDirectory=, which it makes a ton more useful.

(This is basically the /dev + /proc + /sys mounting code posted in the
original #4727, but rebased on current git, and with the automatic logic
replaced by explicit logic controlled by a unit file setting)

											
										
										
											2016-12-22 23:34:35 +01:00
+								                prefix, yes_no(c->mount_apivfs),
-												core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)

New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.
											
										
										
											2016-06-03 17:58:18 +02:00
+								                prefix, yes_no(c->ignore_sigpipe),
-												execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which may be used to lock up the system.

											
										
										
											2016-06-23 01:45:45 +02:00
+								                prefix, yes_no(c->memory_deny_write_execute),
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
+								                prefix, yes_no(c->restrict_realtime),
 								                prefix, exec_keyring_mode_to_string(c->keyring_mode));
-												set nice/oom_adjust only when asked for

											
										
										
											2010-01-28 02:53:56 +01:00
-												core: add RootImage= setting for using a specific image file as root directory for a service

This is similar to RootDirectory= but mounts the root file system from a
block device or loopback file instead of another directory.

This reuses the image dissector code now used by nspawn and
gpt-auto-discovery.

											
										
										
											2016-12-23 14:26:05 +01:00
+								        if (c->root_image)
 								                fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
-												execute: load environment files at time of execution, not when we load the service configuration

https://bugzilla.redhat.com/show_bug.cgi?id=661282

											
										
										
											2011-03-04 03:44:43 +01:00
+								        STRV_FOREACH(e, c->environment)
 								                fprintf(f, "%sEnvironment: %s\n", prefix, *e);
 								        STRV_FOREACH(e, c->environment_files)
 								                fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												execute: Add new PassEnvironment= directive

This directive allows passing environment variables from the system
manager to spawned services. Variables in the system manager can be set
inside a container by passing `--set-env=...` options to systemd-spawn.

Tested with an on-disk test.service unit. Tested using multiple variable
names on a single line, with an empty setting to clear the current list
of variables, with non-existing variables.

Tested using `systemd-run -p PassEnvironment=VARNAME` to confirm it
works with transient units.

Confirmed that `systemctl show` will display the PassEnvironment
settings.

Checked that man pages are generated correctly.

No regressions in `make check`.

											
										
										
											2015-09-07 08:06:53 +02:00
+								        STRV_FOREACH(e, c->pass_environment)
 								                fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
-												core: add new UnsetEnvironment= setting for unit files

With this setting we can explicitly unset specific variables for
processes of a unit, as last step of assembling the environment block
for them. This is useful to fix #6407.

While we are at it, greatly expand the documentation on how the
environment block for forked off processes is assembled.

											
										
										
											2017-09-10 12:16:44 +02:00
+								        STRV_FOREACH(e, c->unset_environment)
 								                fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
-												core: allow preserving contents of RuntimeDirectory= over process restart

This introduces RuntimeDirectoryPreserve= option which takes a boolean
argument or 'restart'.

Closes #6087.

											
										
										
											2017-07-17 09:22:25 +02:00
+								        fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								        for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache/, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								                fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
 								                STRV_FOREACH(d, c->directories[dt].paths)
 								                        fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
 								        }
-												core: dump RuntimeDirectories and RuntimeDirectoryMode too

											
										
										
											2015-10-15 21:15:11 +02:00
-												set nice/oom_adjust only when asked for

											
										
										
											2010-01-28 02:53:56 +01:00
+								        if (c->nice_set)
 								                fprintf(f,
 								                        "%sNice: %i\n",
 								                        prefix, c->nice);
-												exec: replace OOMAdjust= by OOMScoreAdjust= to follow new kernel interface

This replaces OOMAdjust= by OOMScoreAdjust= in the config files,
breaking compatibility with older unit files. However, this keeps compat
with older kernels which lack the new OOM rework.

											
										
										
											2010-08-31 01:33:39 +02:00
+								        if (c->oom_score_adjust_set)
-												set nice/oom_adjust only when asked for

											
										
										
											2010-01-28 02:53:56 +01:00
+								                fprintf(f,
-												exec: replace OOMAdjust= by OOMScoreAdjust= to follow new kernel interface

This replaces OOMAdjust= by OOMScoreAdjust= in the config files,
breaking compatibility with older unit files. However, this keeps compat
with older kernels which lack the new OOM rework.

											
										
										
											2010-08-31 01:33:39 +02:00
+								                        "%sOOMScoreAdjust: %i\n",
 								                        prefix, c->oom_score_adjust);
-												support chrooting/setting of ioprio when spawning

											
										
										
											2010-01-29 20:46:22 +01:00
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        for (i = 0; i < RLIM_NLIMITS; i++)
-												core: dump soft limits too

											
										
										
											2015-11-28 18:15:03 +01:00
+								                if (c->rlimit[i]) {
 								                        fprintf(f, "%s%s: " RLIM_FMT "\n",
 								                                prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
 								                        fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
 								                                prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
 								                }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
+								        if (c->ioprio_set) {
-												execute: modernizations

											
										
										
											2014-02-19 17:49:00 +01:00
+								                _cleanup_free_ char *class_str = NULL;
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
-												core: do not ignore returned values

											
										
										
											2017-08-06 16:34:55 +02:00
+								                r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
 								                if (r >= 0)
 								                        fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
 								                fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
+								        if (c->cpu_sched_set) {
-												execute: modernizations

											
										
										
											2014-02-19 17:49:00 +01:00
+								                _cleanup_free_ char *policy_str = NULL;
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
-												core: do not ignore returned values

											
										
										
											2017-08-06 16:34:55 +02:00
+								                r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
 								                if (r >= 0)
 								                        fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                fprintf(f,
-												execute: allow configuration of SCHED_RESET_ON_FORK

											
										
										
											2010-02-02 12:50:04 +01:00
+								                        "%sCPUSchedulingPriority: %i\n"
 								                        "%sCPUSchedulingResetOnFork: %s\n",
 								                        prefix, c->cpu_sched_priority,
 								                        prefix, yes_no(c->cpu_sched_reset_on_fork));
-												tabs to spaces

Skipped bootchart and various files that looked like they should be
kept in sync with external sources.

											
										
										
											2013-01-09 21:03:11 +01:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												dbus: complete exec coverage

											
										
										
											2010-07-04 16:44:58 +02:00
+								        if (c->cpuset) {
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                fprintf(f, "%sCPUAffinity:", prefix);
-												dbus: complete exec coverage

											
										
										
											2010-07-04 16:44:58 +02:00
+								                for (i = 0; i < c->cpuset_ncpus; i++)
 								                        if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
-												build-sys: minor fixes found with cppcheck

											
										
										
											2013-12-25 19:00:12 +01:00
+								                                fprintf(f, " %u", i);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                fputs("\n", f);
 								        }
-												time-util: add and use USEC/NSEC_INFINITY

											
										
										
											2014-07-29 12:23:31 +02:00
+								        if (c->timer_slack_nsec != NSEC_INFINITY)
-												Use format patterns for usec_t, pid_t, nsec_t, usec_t

It is nicer to predefine patterns using configure time check instead of
using casts everywhere.

Since we do not need to use any flags, include "%" in the format instead
of excluding it like PRI* macros.

											
										
										
											2013-12-30 23:22:26 +01:00
+								                fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
 								        fprintf(f,
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								                "%sStandardInput: %s\n"
 								                "%sStandardOutput: %s\n"
 								                "%sStandardError: %s\n",
 								                prefix, exec_input_to_string(c->std_input),
 								                prefix, exec_output_to_string(c->std_output),
 								                prefix, exec_output_to_string(c->std_error));
 								        if (c->tty_path)
 								                fprintf(f,
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								                        "%sTTYPath: %s\n"
 								                        "%sTTYReset: %s\n"
 								                        "%sTTYVHangup: %s\n"
 								                        "%sTTYVTDisallocate: %s\n",
 								                        prefix, c->tty_path,
 								                        prefix, yes_no(c->tty_reset),
 								                        prefix, yes_no(c->tty_vhangup),
 								                        prefix, yes_no(c->tty_vt_disallocate));
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												execute: make use of IN_SET() where we can

											
										
										
											2017-08-01 11:52:36 +02:00
+								        if (IN_SET(c->std_output,
 								                   EXEC_OUTPUT_SYSLOG,
 								                   EXEC_OUTPUT_KMSG,
 								                   EXEC_OUTPUT_JOURNAL,
 								                   EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 								                   EXEC_OUTPUT_KMSG_AND_CONSOLE,
 								                   EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
 								            IN_SET(c->std_error,
 								                   EXEC_OUTPUT_SYSLOG,
 								                   EXEC_OUTPUT_KMSG,
 								                   EXEC_OUTPUT_JOURNAL,
 								                   EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 								                   EXEC_OUTPUT_KMSG_AND_CONSOLE,
 								                   EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
-												Introduce cleanup functions for cap_free

Unfortunately a different cleanup function is necessary per type,
because cap_t** and char** are incompatible with void**.

											
										
										
											2014-01-01 04:35:54 +01:00
+								                _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
-												core: do not ignore returned values

											
										
										
											2017-08-06 16:34:55 +02:00
+								                r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
 								                if (r >= 0)
 								                        fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
-												core: do not ignore returned values

											
										
										
											2017-08-06 16:34:55 +02:00
+								                r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
 								                if (r >= 0)
 								                        fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
-												shared, core: do not always accept numbers in string lookups

The behaviour of the common name##_from_string conversion is surprising.
It accepts not only the strings from name##_table but also any number
that falls within the range of the table. The order of items in most of
our tables is an internal affair. It should not be visible to the user.

I know of a case where the surprising numeric conversion leads to a crash.

We will allow the direct numeric conversion only for the tables where the
mapping of strings to numeric values has an external meaning. This holds
for the following lookup tables:
 - netlink_family, ioprio_class, ip_tos, sched_policy - their numeric
   values are stable as they are defined by the Linux kernel interface.
 - log_level, log_facility_unshifted - the well-known syslog interface.

We allow the user to use numeric values whose string names systemd does
not know. For instance, the user may want to test a new kernel featuring
a scheduling policy that did not exist when his systemd version was
released. A slightly unpleasant effect of this is that the
name##_to_string conversion cannot return pointers to constant strings
anymore. The strings have to be allocated on demand and freed by the
caller.

											
										
										
											2012-10-30 14:29:38 +01:00
+								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												securebits-util: add secure_bits_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:40:25 +02:00
+								        if (c->secure_bits) {
 								                _cleanup_free_ char *str = NULL;
 								                r = secure_bits_to_string_alloc(c->secure_bits, &str);
 								                if (r >= 0)
 								                        fprintf(f, "%sSecure Bits: %s\n", prefix, str);
 								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												capabilities: keep bounding set in non-inverted format.

Change the capability bounding set parser and logic so that the bounding
set is kept as a positive set internally. This means that the set
reflects those capabilities that we want to keep instead of drop.

											
										
										
											2016-01-07 23:00:04 +01:00
+								        if (c->capability_bounding_set != CAP_ALL) {
-												cap-list: add capability_set_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:25:11 +02:00
+								                _cleanup_free_ char *str = NULL;
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												cap-list: add capability_set_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:25:11 +02:00
+								                r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
 								                if (r >= 0)
 								                        fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
+								        }
 								        if (c->capability_ambient_set != 0) {
-												cap-list: add capability_set_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:25:11 +02:00
+								                _cleanup_free_ char *str = NULL;
-												capabilities: added support for ambient capabilities.

This patch adds support for ambient capabilities in service files. The
idea with ambient capabilities is that the execed processes can run with
non-root user and get some inherited capabilities, without having any
need to add the capabilities to the executable file.

You need at least Linux 4.3 to use ambient capabilities. SecureBit
keep-caps is automatically added when you use ambient capabilities and
wish to change the user.

An example system service file might look like this:

[Unit]
Description=Service for testing caps

[Service]
ExecStart=/usr/bin/sleep 10000
User=nobody
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW

After starting the service it has these capabilities:

CapInh: 0000000000003000
CapPrm: 0000000000003000
CapEff: 0000000000003000
CapBnd: 0000003fffffffff
CapAmb: 0000000000003000

											
										
										
											2015-12-31 13:54:44 +01:00
-												cap-list: add capability_set_{from_string,to_string_alloc}()

											
										
										
											2017-08-07 16:25:11 +02:00
+								                r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
 								                if (r >= 0)
 								                        fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        }
 								        if (c->user)
-												execute: handle format strings in User= and other directives

											
										
										
											2010-06-18 23:25:19 +02:00
+								                fprintf(f, "%sUser: %s\n", prefix, c->user);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        if (c->group)
-												execute: handle format strings in User= and other directives

											
										
										
											2010-06-18 23:25:19 +02:00
+								                fprintf(f, "%sGroup: %s\n", prefix, c->group);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												core: add a concept of "dynamic" user ids, that are allocated as long as a service is running

This adds a new boolean setting DynamicUser= to service files. If set, a new
user will be allocated dynamically when the unit is started, and released when
it is stopped. The user ID is allocated from the range 61184..65519. The user
will not be added to /etc/passwd (but an NSS module to be added later should
make it show up in getent passwd).

For now, care should be taken that the service writes no files to disk, since
this might result in files owned by UIDs that might get assigned dynamically to
a different service later on. Later patches will tighten sandboxing in order to
ensure that this cannot happen, except for a few selected directories.

A simple way to test this is:

        systemd-run -p DynamicUser=1 /bin/sleep 99999

											
										
										
											2016-07-14 12:37:28 +02:00
+								        fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
-												core: use strv_isempty to check if supplementary_groups is empty

With the previous commit, we know that it will be NULL if empty, but
it's safe to always use strv_isempty() in case the code changes
in the future.

											
										
										
											2017-10-04 11:33:30 +02:00
+								        if (!strv_isempty(c->supplementary_groups)) {
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                fprintf(f, "%sSupplementaryGroups:", prefix);
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								                strv_fprintf(f, c->supplementary_groups);
 								                fputs("\n", f);
 								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
+								        if (c->pam_name)
-												execute: handle format strings in User= and other directives

											
										
										
											2010-06-18 23:25:19 +02:00
+								                fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
-												service: optionally call into PAM when dropping privileges

											
										
										
											2010-06-16 21:54:17 +02:00
-												doc,core: Read{Write,Only}Paths= and InaccessiblePaths=

This patch renames Read{Write,Only}Directories= and InaccessibleDirectories=
to Read{Write,Only}Paths= and InaccessiblePaths=, previous names are kept
as aliases but they are not advertised in the documentation.

Renamed variables:
`read_write_dirs` --> `read_write_paths`
`read_only_dirs` --> `read_only_paths`
`inaccessible_dirs` --> `inaccessible_paths`

											
										
										
											2016-07-07 11:17:00 +02:00
+								        if (strv_length(c->read_write_paths) > 0) {
 								                fprintf(f, "%sReadWritePaths:", prefix);
 								                strv_fprintf(f, c->read_write_paths);
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								                fputs("\n", f);
 								        }
-												doc,core: Read{Write,Only}Paths= and InaccessiblePaths=

This patch renames Read{Write,Only}Directories= and InaccessibleDirectories=
to Read{Write,Only}Paths= and InaccessiblePaths=, previous names are kept
as aliases but they are not advertised in the documentation.

Renamed variables:
`read_write_dirs` --> `read_write_paths`
`read_only_dirs` --> `read_only_paths`
`inaccessible_dirs` --> `inaccessible_paths`

											
										
										
											2016-07-07 11:17:00 +02:00
+								        if (strv_length(c->read_only_paths) > 0) {
 								                fprintf(f, "%sReadOnlyPaths:", prefix);
 								                strv_fprintf(f, c->read_only_paths);
-												execute: support basic filesystem namespacing

											
										
										
											2010-04-21 22:15:06 +02:00
+								                fputs("\n", f);
 								        }
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												doc,core: Read{Write,Only}Paths= and InaccessiblePaths=

This patch renames Read{Write,Only}Directories= and InaccessibleDirectories=
to Read{Write,Only}Paths= and InaccessiblePaths=, previous names are kept
as aliases but they are not advertised in the documentation.

Renamed variables:
`read_write_dirs` --> `read_write_paths`
`read_only_dirs` --> `read_only_paths`
`inaccessible_dirs` --> `inaccessible_paths`

											
										
										
											2016-07-07 11:17:00 +02:00
+								        if (strv_length(c->inaccessible_paths) > 0) {
 								                fprintf(f, "%sInaccessiblePaths:", prefix);
 								                strv_fprintf(f, c->inaccessible_paths);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								                fputs("\n", f);
 								        }
-												execute: add ability to configure the kill signal

											
										
										
											2010-07-10 04:49:37 +02:00
-												core: add ability to define arbitrary bind mounts for services

This adds two new settings BindPaths= and BindReadOnlyPaths=. They allow
defining arbitrary bind mounts specific to particular services. This is
particularly useful for services with RootDirectory= set as this permits making
specific bits of the host directory available to chrooted services.

The two new settings follow the concepts nspawn already possesses in --bind= and
--bind-ro=, as well as the .nspawn settings Bind= and BindReadOnly= (and these
latter options should probably be renamed to BindPaths= and BindReadOnlyPaths=
too).

Fixes: #3439

											
										
										
											2016-11-23 22:21:40 +01:00
+								        if (c->n_bind_mounts > 0)
 								                for (i = 0; i < c->n_bind_mounts; i++) {
 								                        fprintf(f, "%s%s: %s:%s:%s\n", prefix,
 								                                c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
 								                                c->bind_mounts[i].source,
 								                                c->bind_mounts[i].destination,
 								                                c->bind_mounts[i].recursive ? "rbind" : "norbind");
 								                }
-												service: optionally, create INIT_PROCESS/DEAD_PROCESS entries for a service

This should fix accounting for pam_limits and suchlike.

https://bugzilla.redhat.com/show_bug.cgi?id=636036

											
										
										
											2010-10-08 16:06:23 +02:00
+								        if (c->utmp_id)
 								                fprintf(f,
 								                        "%sUtmpIdentifier: %s\n",
 								                        prefix, c->utmp_id);
-												exec: Add SELinuxContext configuration item

This permits system administrators to decide the domain of a service.
This can be used with templated units to have each service in a different
domain (for example, a per-customer database, using MLS or anything),
or can be used to force a non-SELinux-enabled system (jvm, erlang, etc.)
to start in a different domain for each service.

											
										
										
											2014-02-06 10:05:16 +01:00
 								        if (c->selinux_context)
 								                fprintf(f,
-												core: store and expose SELinuxContext field normalized as bool + string

											
										
										
											2014-02-17 16:52:52 +01:00
+								                        "%sSELinuxContext: %s%s\n",
 								                        prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
-												core: dump also missed security context

											
										
										
											2017-07-13 06:10:41 +02:00
+								        if (c->apparmor_profile)
 								                fprintf(f,
 								                        "%sAppArmorProfile: %s%s\n",
 								                        prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
 								        if (c->smack_process_label)
 								                fprintf(f,
 								                        "%sSmackProcessLabel: %s%s\n",
 								                        prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
-												util: introduce PERSONALITY_INVALID as macro for 0xffffffffLU

											
										
										
											2015-05-21 19:48:49 +02:00
+								        if (c->personality != PERSONALITY_INVALID)
-												core: add Personality= option for units to set the personality for spawned processes

											
										
										
											2014-02-19 02:15:24 +01:00
+								                fprintf(f,
 								                        "%sPersonality: %s\n",
 								                        prefix, strna(personality_to_string(c->personality)));
-												seccomp: LockPersonality boolean (#6193)

Add LockPersonality boolean to allow locking down personality(2)
system call so that the execution domain can't be changed.
This may be useful to improve security because odd emulations
may be poorly tested and source of vulnerabilities, while
system services shouldn't need any weird personalities.

											
										
										
											2017-07-04 14:48:18 +02:00
+								        fprintf(f,
 								                "%sLockPersonality: %s\n",
 								                prefix, yes_no(c->lock_personality));
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								        if (c->syscall_filter) {
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                Iterator j;
 								                void *id;
 								                bool first = true;
-												core: fix build without libseccomp

											
										
										
											2014-02-12 18:44:40 +01:00
+								#endif
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
 								                fprintf(f,
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								                        "%sSystemCallFilter: ",
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                        prefix);
 								                if (!c->syscall_whitelist)
 								                        fputc('~', f);
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                SET_FOREACH(id, c->syscall_filter, j) {
 								                        _cleanup_free_ char *name = NULL;
 								                        if (first)
 								                                first = false;
 								                        else
 								                                fputc(' ', f);
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								                        name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                        fputs(strna(name), f);
 								                }
-												core: fix build without libseccomp

											
										
										
											2014-02-12 18:44:40 +01:00
+								#endif
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
 								                fputc('\n', f);
 								        }
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								        if (c->syscall_archs) {
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								                Iterator j;
 								                void *id;
 								#endif
 								                fprintf(f,
 								                        "%sSystemCallArchitectures:",
 								                        prefix);
-												build-sys: use #if Y instead of #ifdef Y everywhere

The advantage is that if the name is misspelled, cpp will warn us.

$ git grep -Ee "conf.set\('(HAVE|ENABLE)_" -l|xargs sed -r -i "s/conf.set\('(HAVE|ENABLE)_/conf.set10('\1_/"
$ git grep -Ee '#ifn?def (HAVE|ENABLE)' -l|xargs sed -r -i 's/#ifdef (HAVE|ENABLE)/#if \1/; s/#ifndef (HAVE|ENABLE)/#if ! \1/;'
$ git grep -Ee 'if.*defined\(HAVE' -l|xargs sed -i -r 's/defined\((HAVE_[A-Z0-9_]*)\)/\1/g'
$ git grep -Ee 'if.*defined\(ENABLE' -l|xargs sed -i -r 's/defined\((ENABLE_[A-Z0-9_]*)\)/\1/g'
+ manual changes to meson.build

squash! build-sys: use #if Y instead of #ifdef Y everywhere

v2:
- fix incorrect setting of HAVE_LIBIDN2

											
										
										
											2017-10-03 10:41:51 +02:00
+								#if HAVE_SECCOMP
-												core: add SystemCallArchitectures= unit setting to allow disabling of non-native
architecture support for system calls

Also, turn system call filter bus properties into complex types instead
of concatenated strings.

											
										
										
											2014-02-13 00:24:00 +01:00
+								                SET_FOREACH(id, c->syscall_archs, j)
 								                        fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
 								#endif
 								                fputc('\n', f);
 								        }
-												core: add new RestrictNamespaces= unit file setting

This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().

RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.

This setting should improve security quite a bit as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.

											
										
										
											2016-11-02 03:25:19 +01:00
+								        if (exec_context_restrict_namespaces_set(c)) {
 								                _cleanup_free_ char *s = NULL;
 								                r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
 								                if (r >= 0)
 								                        fprintf(f, "%sRestrictNamespaces: %s\n",
 								                                prefix, s);
 								        }
-												tree-wide: check if errno is greater than zero (2)

Compare errno with zero in a way that tells gcc that
(if the condition is true) errno is positive.

											
										
										
											2016-01-11 20:31:14 +01:00
+								        if (c->syscall_errno > 0)
-												core: rework syscall filter

- Allow configuration of an errno error to return from blacklisted
  syscalls, instead of immediately terminating a process.

- Fix parsing logic when libseccomp support is turned off

- Only keep the actual syscall set in the ExecContext, and generate the
  string version only on demand.

											
										
										
											2014-02-12 18:28:21 +01:00
+								                fprintf(f,
 								                        "%sSystemCallErrorNumber: %s\n",
 								                        prefix, strna(errno_to_name(c->syscall_errno)));
-												core: Add AppArmor profile switching

This permit to switch to a specific apparmor profile when starting a daemon. This
will result in a non operation if apparmor is disabled.
It also add a new build requirement on libapparmor for using this feature.

											
										
										
											2014-02-20 16:19:44 +01:00
 								        if (c->apparmor_profile)
 								                fprintf(f,
 								                        "%sAppArmorProfile: %s%s\n",
 								                        prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								}
-												core: introduce new Delegate=yes/no property controlling creation of cgroup subhierarchies

For privileged units this resource control property ensures that the
processes have all controllers systemd manages enabled.

For unprivileged services (those with User= set) this ensures that
access rights to the service cgroup are granted to the user in question,
to create further subgroups. Note that this only applies to the
name=systemd hierarchy though, as access to other controllers is not
safe for unprivileged processes.

Delegate=yes should be set for container scopes where a systemd instance
inside the container shall manage the hierarchies below its own cgroup
and have access to all controllers.

Delegate=yes should also be set for user@.service, so that systemd
--user can run, controlling its own cgroup tree.

This commit changes machined, systemd-nspawn@.service and user@.service
to set this boolean, in order to ensure that container management will
just work, and the user systemd instance can run fine.

											
										
										
											2014-11-05 17:57:23 +01:00
+								bool exec_context_maintains_privileges(ExecContext *c) {
 								        assert(c);
-												treewide: fix typos and remove accidental repetition of words

											
										
										
											2016-07-10 14:48:23 +02:00
+								        /* Returns true if the process forked off would run under
-												core: introduce new Delegate=yes/no property controlling creation of cgroup subhierarchies

For priviliged units this resource control property ensures that the
processes have all controllers systemd manages enabled.

For unpriviliged services (those with User= set) this ensures that
access rights to the service cgroup is granted to the user in question,
to create further subgroups. Note that this only applies to the
name=systemd hierarchy though, as access to other controllers is not
safe for unpriviliged processes.

Delegate=yes should be set for container scopes where a systemd instance
inside the container shall manage the hierarchies below its own cgroup
and have access to all controllers.

Delegate=yes should also be set for user@.service, so that systemd
--user can run, controlling its own cgroup tree.

This commit changes machined, systemd-nspawn@.service and user@.service
to set this boolean, in order to ensure that container management will
just work, and the user systemd instance can run fine.

											
										
										
											2014-11-05 17:57:23 +01:00
+								         * an unchanged UID or as root. */
 								        if (!c->user)
 								                return true;
 								        if (streq(c->user, "root") || streq(c->user, "0"))
 								                return true;
 								        return false;
 								}
-												core: make IOSchedulingClass= and IOSchedulingPriority= settable for transient units

This patch is a bit more complex than I hoped. In particular the single
IOScheduling= property exposed on the bus is split up into
IOSchedulingClass= and IOSchedulingPriority= (though compat is
retained). Otherwise the asymmetry between setting props and getting
them is a bit too nasty.

Fixes #5613

											
										
										
											2017-06-26 17:40:08 +02:00
+								int exec_context_get_effective_ioprio(ExecContext *c) {
 								        int p;
 								        assert(c);
 								        if (c->ioprio_set)
 								                return c->ioprio;
 								        p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
 								        if (p < 0)
 								                return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
 								        return p;
 								}
-												dbus: complete exec status coverage

											
										
										
											2010-07-04 18:49:58 +02:00
+								void exec_status_start(ExecStatus *s, pid_t pid) {
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        assert(s);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
-												dbus: complete exec status coverage

											
										
										
											2010-07-04 18:49:58 +02:00
+								        zero(*s);
 								        s->pid = pid;
 								        dual_timestamp_get(&s->start_timestamp);
 								}
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
-												dbus: complete exec status coverage

											
										
										
											2010-07-04 18:49:58 +02:00
+								        assert(s);
-												execute: fix losing of start timestamps

Start timestamps were always cleared before saving exit timestamps.
Fix it by removing a condition that makes no sense any way I look at it.

											
										
										
											2011-12-17 01:33:40 +01:00
+								        if (s->pid && s->pid != pid)
-												dbus: complete exec status coverage

											
										
										
											2010-07-04 18:49:58 +02:00
+								                zero(*s);
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        s->pid = pid;
-												core: rename struct timestamp to dual_timestamp to avoid name clash with IP system headers

											
										
										
											2010-07-01 00:26:44 +02:00
+								        dual_timestamp_get(&s->exit_timestamp);
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
-												first attempt at proper service/socket logic

											
										
										
											2010-01-26 04:18:44 +01:00
+								        s->code = code;
 								        s->status = status;
-												service: optionally, create INIT_PROCESS/DEAD_PROCESS entries for a service

This should fix accounting for pam_limits and suchlike.

https://bugzilla.redhat.com/show_bug.cgi?id=636036

											
										
										
											2010-10-08 16:06:23 +02:00
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								        if (context) {
 								                if (context->utmp_id)
 								                        utmp_put_dead_process(context->utmp_id, pid, code, status);
-												core: don't reset /dev/console if stdin/stdout/stderr as passed as fd in a transient service

Otherwise we might end resetting /dev/console all the time when a transient service starts or stops.

Fixes #2377
Fixes #2198
Fixes #2061

											
										
										
											2016-01-28 16:25:39 +01:00
+								                exec_context_tty_reset(context, NULL);
-												exec: hangup/reset/deallocate VTs in gettys

Explicitly disconnect all clients from a VT when a getty starts/finishes
(requires TIOCVHANGUP, available in 2.6.29).

Explicitly deallocate getty VTs in order to flush scrollback buffer.

Explicitly reset terminals to a defined state before spawning getty.

											
										
										
											2011-05-18 01:07:31 +02:00
+								        }
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								}
 								void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
 								        char buf[FORMAT_TIMESTAMP_MAX];
 								        assert(s);
 								        assert(f);
 								        if (s->pid <= 0)
 								                return;
-												core: unify how we generate the prefix string when dumping unit state

											
										
										
											2014-08-21 16:15:49 +02:00
+								        prefix = strempty(prefix);
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								        fprintf(f,
-												Use format patterns for usec_t, pid_t, nsec_t, usec_t

It is nicer to predefine patterns using configure time check instead of
using casts everywhere.

Since we do not need to use any flags, include "%" in the format instead
of excluding it like PRI* macros.

											
										
										
											2013-12-30 23:22:26 +01:00
+								                "%sPID: "PID_FMT"\n",
 								                prefix, s->pid);
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
-												core: use the correct APIs to determine whether a dual timestamp is initialized

											
										
										
											2016-07-27 11:50:37 +02:00
+								        if (dual_timestamp_is_set(&s->start_timestamp))
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								                fprintf(f,
 								                        "%sStart Timestamp: %s\n",
-												core: rename struct timestamp to dual_timestamp to avoid name clash with IP system headers

											
										
										
											2010-07-01 00:26:44 +02:00
+								                        prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
-												core: use the correct APIs to determine whether a dual timestamp is initialized

											
										
										
											2016-07-27 11:50:37 +02:00
+								        if (dual_timestamp_is_set(&s->exit_timestamp))
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								                fprintf(f,
 								                        "%sExit Timestamp: %s\n"
 								                        "%sExit Code: %s\n"
 								                        "%sExit Status: %i\n",
-												core: rename struct timestamp to dual_timestamp to avoid name clash with IP system headers

											
										
										
											2010-07-01 00:26:44 +02:00
+								                        prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								                        prefix, sigchld_code_to_string(s->code),
 								                        prefix, s->status);
-												first attempt in implementinging execution logic

											
										
										
											2010-01-23 01:52:57 +01:00
+								}
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
+								char *exec_command_line(char **argv) {
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								        size_t k;
 								        char *n, *p, **a;
 								        bool first = true;
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
+								        assert(argv);
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
-												properly terminate strings with NUL byte

											
										
										
											2010-01-27 02:15:54 +01:00
+								        k = 1;
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
+								        STRV_FOREACH(a, argv)
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								                k += strlen(*a)+3;
-												execute: apply seccomp filters after changing selinux/aa/smack contexts

Seccomp is generally an unprivileged operation, changing security contexts is
most likely associated with some form of policy. Moreover, while seccomp may
influence our own flow of code quite a bit (much more than the security context
change) make sure to apply the seccomp filters immediately before executing the
binary to invoke.

This also moves enforcement of NNP after the security context change, so that
NNP cannot affect it anymore. (However, the security policy now has to permit
the NNP change).

This change has a good chance of breaking current SELinux/AA/SMACK setups, because
the policy might not expect this change of behaviour. However, it's technically
the better choice I think and should hence be applied.

Fixes: #3993

											
										
										
											2016-10-25 15:52:54 +02:00
+								        n = new(char, k);
 								        if (!n)
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								                return NULL;
 								        p = n;
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
+								        STRV_FOREACH(a, argv) {
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
 								                if (!first)
 								                        *(p++) = ' ';
 								                else
 								                        first = false;
 								                if (strpbrk(*a, WHITESPACE)) {
 								                        *(p++) = '\'';
 								                        p = stpcpy(p, *a);
 								                        *(p++) = '\'';
 								                } else
 								                        p = stpcpy(p, *a);
 								        }
-												properly terminate strings with NUL byte

											
										
										
											2010-01-27 02:15:54 +01:00
+								        *p = 0;
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								        /* FIXME: this doesn't really handle arguments that have
 								         * spaces and ticks in them */
 								        return n;
 								}
 								void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
-												use more _cleanup_ macro

											
										
										
											2014-06-24 19:00:32 +02:00
+								        _cleanup_free_ char *cmd = NULL;
-												core: unify how we generate the prefix string when dumping unit state

											
										
										
											2014-08-21 16:15:49 +02:00
+								        const char *prefix2;
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
 								        assert(c);
 								        assert(f);
-												core: unify how we generate the prefix string when dumping unit state

											
										
										
											2014-08-21 16:15:49 +02:00
+								        prefix = strempty(prefix);
-												util: rework strappenda(), and rename it strjoina()

After all it is now much more like strjoin() than strappend(). At the
same time, add support for NULL sentinels, even if they are normally not
necessary.

											
										
										
											2015-02-03 02:05:59 +01:00
+								        prefix2 = strjoina(prefix, "\t");
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
-												core: add minimal templating system

											
										
										
											2010-04-15 03:11:11 +02:00
+								        cmd = exec_command_line(c->argv);
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								        fprintf(f,
 								                "%sCommand Line: %s\n",
 								                prefix, cmd ? cmd : strerror(ENOMEM));
-												execute: automatically record start/exit timestamps for forked processes

											
										
										
											2010-04-10 05:03:14 +02:00
+								        exec_status_dump(&c->exec_status, f, prefix2);
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
+								}
 								void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
 								        assert(f);
-												core: unify how we generate the prefix string when dumping unit state

											
										
										
											2014-08-21 16:15:49 +02:00
+								        prefix = strempty(prefix);
-												various cleanups

											
										
										
											2010-01-26 07:02:51 +01:00
 								        LIST_FOREACH(command, c, c)
 								                exec_command_dump(c, f, prefix);
 								}
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
-												execute: simplify appending to execution list

											
										
										
											2010-02-14 01:05:55 +01:00
+								void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
 								        ExecCommand *end;
 								        assert(l);
 								        assert(e);
 								        if (*l) {
-												Spelling Corrections

Just some lame spelling corrections with no functionality.

											
										
										
											2011-02-21 15:32:17 +01:00
+								                /* It's kind of important, that we keep the order here */
-												list: make our list macros a bit easier to use by not requring type spec on each invocation

We can determine the list entry type via the typeof() gcc construct, and
so we should to make the macros much shorter to use.

											
										
										
											2013-10-14 06:10:14 +02:00
+								                LIST_FIND_TAIL(command, *l, end);
 								                LIST_INSERT_AFTER(command, *l, end, e);
-												execute: simplify appending to execution list

											
										
										
											2010-02-14 01:05:55 +01:00
+								        } else
 								              *l = e;
 								}
-												execute: introduce exec_command_set() for easy setting for command lines

											
										
										
											2010-04-10 17:46:41 +02:00
+								int exec_command_set(ExecCommand *c, const char *path, ...) {
 								        va_list ap;
 								        char **l, *p;
 								        assert(c);
 								        assert(path);
 								        va_start(ap, path);
 								        l = strv_new_ap(path, ap);
 								        va_end(ap);
 								        if (!l)
 								                return -ENOMEM;
-												strv: introduce new strv_from_stdarg_alloca() macro to generate a string array from stdarg function parameters

This allows us to turn lists of strings passed in easily into string
arrays without having to allocate memory.

											
										
										
											2013-10-29 19:53:43 +01:00
+								        p = strdup(path);
 								        if (!p) {
-												execute: introduce exec_command_set() for easy setting for command lines

											
										
										
											2010-04-10 17:46:41 +02:00
+								                strv_free(l);
 								                return -ENOMEM;
 								        }
 								        free(c->path);
 								        c->path = p;
 								        strv_free(c->argv);
 								        c->argv = l;
 								        return 0;
 								}
-												swap: introduce Discard property

Process possible "discard" values from /etc/fstab.

											
										
										
											2014-09-24 14:29:05 +02:00
+								int exec_command_append(ExecCommand *c, const char *path, ...) {
-												core: execute - don't leak strv

											
										
										
											2014-09-30 11:34:01 +02:00
+								        _cleanup_strv_free_ char **l = NULL;
-												swap: introduce Discard property

Process possible "discard" values from /etc/fstab.

											
										
										
											2014-09-24 14:29:05 +02:00
+								        va_list ap;
 								        int r;
 								        assert(c);
 								        assert(path);
 								        va_start(ap, path);
 								        l = strv_new_ap(path, ap);
 								        va_end(ap);
 								        if (!l)
 								                return -ENOMEM;
-												ask-password: add support for caching passwords in the kernel keyring

This adds support for caching harddisk passwords in the kernel keyring
if it is available, thus supporting caching without Plymouth being
around.

This is also useful for hooking up "gdm-auto-login" with the collected
boot-time harddisk password, in order to support gnome keyring
passphrase unlocking via the HDD password, if it is the same.

Any passwords added to the kernel keyring this way have a timeout of
2.5min at which time they are purged from the kernel.

											
										
										
											2015-10-07 11:26:10 +02:00
+								        r = strv_extend_strv(&c->argv, l, false);
-												core: execute - don't leak strv

											
										
										
											2014-09-30 11:34:01 +02:00
+								        if (r < 0)
-												swap: introduce Discard property

Process possible "discard" values from /etc/fstab.

											
										
										
											2014-09-24 14:29:05 +02:00
+								                return r;
 								        return 0;
 								}
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								static int exec_runtime_allocate(ExecRuntime **rt) {
 								        if (*rt)
 								                return 0;
 								        *rt = new0(ExecRuntime, 1);
-												core: Forgot to dereference pointer when checking for NULL

Actually we already checked for !rt before, now we'd like to examine
the return value of the memory allocation.

											
										
										
											2013-12-30 00:18:39 +01:00
+								        if (!*rt)
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                return -ENOMEM;
 								        (*rt)->n_ref = 1;
 								        (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
 								        return 0;
 								}
 								int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
 								        int r;
 								        assert(rt);
 								        assert(c);
 								        assert(id);
 								        if (*rt)
 								                return 1;
 								        if (!c->private_network && !c->private_tmp)
 								                return 0;
 								        r = exec_runtime_allocate(rt);
 								        if (r < 0)
 								                return r;
 								        if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
-												execute: make sure JoinsNamespaceOf= doesn't leak ns fds to executed processes

											
										
										
											2016-07-14 13:12:01 +02:00
+								                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                        return -errno;
 								        }
 								        if (c->private_tmp && !(*rt)->tmp_dir) {
 								                r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
 								                if (r < 0)
 								                        return r;
 								        }
 								        return 1;
 								}
 								ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
 								        assert(r);
 								        assert(r->n_ref > 0);
 								        r->n_ref++;
 								        return r;
 								}
 								ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
 								        if (!r)
 								                return NULL;
 								        assert(r->n_ref > 0);
 								        r->n_ref--;
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlinks exist for the mount
  point already, .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								        if (r->n_ref > 0)
 								                return NULL;
 								        free(r->tmp_dir);
 								        free(r->var_tmp_dir);
 								        safe_close_pair(r->netns_storage_socket);
-												tree-wide: use mfree more

											
										
										
											2016-10-17 00:28:30 +02:00
+								        return mfree(r);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								}
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already; .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        assert(u);
 								        assert(f);
 								        assert(fds);
 								        if (!rt)
 								                return 0;
 								        if (rt->tmp_dir)
 								                unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
 								        if (rt->var_tmp_dir)
 								                unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
 								        if (rt->netns_storage_socket[0] >= 0) {
 								                int copy;
 								                copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
 								                if (copy < 0)
 								                        return copy;
 								                unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
 								        }
 								        if (rt->netns_storage_socket[1] >= 0) {
 								                int copy;
 								                copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
 								                if (copy < 0)
 								                        return copy;
 								                unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
 								        }
 								        return 0;
 								}
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already; .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        int r;
 								        assert(rt);
 								        assert(key);
 								        assert(value);
 								        if (streq(key, "tmp-dir")) {
 								                char *copy;
 								                r = exec_runtime_allocate(rt);
 								                if (r < 0)
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already; .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        return log_oom();
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
 								                copy = strdup(value);
 								                if (!copy)
 								                        return log_oom();
 								                free((*rt)->tmp_dir);
 								                (*rt)->tmp_dir = copy;
 								        } else if (streq(key, "var-tmp-dir")) {
 								                char *copy;
 								                r = exec_runtime_allocate(rt);
 								                if (r < 0)
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already; .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        return log_oom();
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
 								                copy = strdup(value);
 								                if (!copy)
 								                        return log_oom();
 								                free((*rt)->var_tmp_dir);
 								                (*rt)->var_tmp_dir = copy;
 								        } else if (streq(key, "netns-socket-0")) {
 								                int fd;
 								                r = exec_runtime_allocate(rt);
 								                if (r < 0)
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already; .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        return log_oom();
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
 								                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already; .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                else {
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop > 200 lines of code that
were required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								                        safe_close((*rt)->netns_storage_socket[0]);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                        (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
 								                }
 								        } else if (streq(key, "netns-socket-1")) {
 								                int fd;
 								                r = exec_runtime_allocate(rt);
 								                if (r < 0)
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already; .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        return log_oom();
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
 								                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
-												core,network: major per-object logging rework

This changes log_unit_info() (and friends) to take a real Unit* object
instead of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed format strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.

Also contains a couple of other logging improvements:

- Drops a couple of strerror() invocations in favour of using %m.

- Not only .mount units now warn if a symlink exists for the mount
  point already; .automount units do that too, now.

- A few invocations of log_struct() that didn't actually pass any
  additional structured data have been replaced by simpler invocations
  of log_unit_info() and friends.

- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
  that works like LOG_MESSAGE() but prefixes the message with the unit
  name. Similar, there's now LOG_LINK_MESSAGE() and
  LOG_NETDEV_MESSAGE().

- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
  LOG_NETDEV_INTERFACE() macros have been added that generate the
  necessary per object fields. The old log_unit_struct() call has been
  removed in favour of these new macros used in raw log_struct()
  invocations. In addition to removing one more function call this
  allows generated structured log messages that contain two object
  fields, as necessary for example for network interfaces that are
  joined into another network interface, and whose messages shall be
  indexed by both.

- The LOG_ERRNO() macro has been removed, in favour of
  log_struct_errno(). The latter has the benefit of ensuring that %m in
  format strings is properly resolved to the specified error number.

- A number of logging messages have been converted to use
  log_unit_info() instead of log_info()

- The client code in sysv-generator no longer #includes core code from
  src/core/.

- log_unit_full_errno() has been removed, log_unit_full() instead takes
  an errno now, too.

- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
  avoid double evaluation of their parameters

											
										
										
											2015-05-11 20:38:21 +02:00
+								                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                else {
-												util: replace close_nointr_nofail() by a more useful safe_close()

safe_close() automatically becomes a NOP when a negative fd is passed,
and returns -1 unconditionally. This makes it easy to write lines like
this:

        fd = safe_close(fd);

Which will close an fd if it is open, and reset the fd variable
correctly.

By making use of this new scheme we can drop > 200 lines of code that
were required to test for non-negative fds or to reset the closed fd
variable afterwards.

											
										
										
											2014-03-18 19:22:43 +01:00
+								                        safe_close((*rt)->netns_storage_socket[1]);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                        (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
 								                }
 								        } else
 								                return 0;
 								        return 1;
 								}
 								static void *remove_tmpdir_thread(void *p) {
 								        _cleanup_free_ char *path = p;
-												util: rework rm_rf() logic

- Move to its own file rm-rf.c

- Change parameters into a single flags parameter

- Remove "honour sticky" logic, it's unused these days

											
										
										
											2015-04-04 11:52:57 +02:00
+								        (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        return NULL;
 								}
 								void exec_runtime_destroy(ExecRuntime *rt) {
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
+								        int r;
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								        if (!rt)
 								                return;
 								        /* If there are multiple users of this, let's leave the stuff around */
 								        if (rt->n_ref > 1)
 								                return;
 								        if (rt->tmp_dir) {
 								                log_debug("Spawning thread to nuke %s", rt->tmp_dir);
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
 								                r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
 								                if (r < 0) {
-												treewide: no need to negate errno for log_*_errno()

It correctly handles both positive and negative errno values.

											
										
										
											2014-11-28 13:19:16 +01:00
+								                        log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
+								                        free(rt->tmp_dir);
 								                }
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                rt->tmp_dir = NULL;
 								        }
 								        if (rt->var_tmp_dir) {
 								                log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
 								                r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
 								                if (r < 0) {
-												treewide: no need to negate errno for log_*_errno()

It correctly handles both positive and negative errno values.

											
										
										
											2014-11-28 13:19:16 +01:00
+								                        log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
-												execute: free directory path if we fail to remove it because we cannot allocate a thread

											
										
										
											2014-03-03 17:11:39 +01:00
+								                        free(rt->var_tmp_dir);
 								                }
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								                rt->var_tmp_dir = NULL;
 								        }
-												util: replace close_pipe() with new safe_close_pair()

safe_close_pair() is more like safe_close(), except that it handles
pairs of fds, and doesn't make any misleading allusion, as it works
similarly well for socketpairs() as for pipe()s...

											
										
										
											2014-03-24 03:22:44 +01:00
+								        safe_close_pair(rt->netns_storage_socket);
-												service: add the ability for units to join other unit's PrivateNetwork= and PrivateTmp= namespaces

											
										
										
											2013-11-27 20:23:18 +01:00
+								}
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
 								        [EXEC_INPUT_NULL] = "null",
 								        [EXEC_INPUT_TTY] = "tty",
 								        [EXEC_INPUT_TTY_FORCE] = "tty-force",
-												socket: optionally call accept() for incoming connections and spawn one service instance per connection

											
										
										
											2010-04-15 06:19:54 +02:00
+								        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        [EXEC_INPUT_SOCKET] = "socket",
 								        [EXEC_INPUT_NAMED_FD] = "fd",
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								};
-												systemctl: introduce systemctl kill

											
										
										
											2010-10-22 16:11:50 +02:00
+								DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        [EXEC_OUTPUT_INHERIT] = "inherit",
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        [EXEC_OUTPUT_NULL] = "null",
-												rework tty handling

We now make sure to run all services in their own session, possibly with
a controlling terminal.

This also extends the service and socket state machines a little.

											
										
										
											2010-04-13 02:06:27 +02:00
+								        [EXEC_OUTPUT_TTY] = "tty",
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								        [EXEC_OUTPUT_SYSLOG] = "syslog",
-												execute: optionally forward program output to /dev/console in addition to syslog/kmsg

											
										
										
											2011-02-15 01:27:53 +01:00
+								        [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
-												execute: s/EXEC_OUTPUT_KERNEL/EXEC_OUTPUT_KMSG/ to follow LOG_TARGET_xxx nomenclature

											
										
										
											2010-05-19 21:49:03 +02:00
+								        [EXEC_OUTPUT_KMSG] = "kmsg",
-												execute: optionally forward program output to /dev/console in addition to syslog/kmsg

											
										
										
											2011-02-15 01:27:53 +01:00
+								        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
-												journal: introduce log target 'journal' for executed processes

											
										
										
											2012-01-05 23:54:45 +01:00
+								        [EXEC_OUTPUT_JOURNAL] = "journal",
 								        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
-												core/exec: add a named-descriptor option ("fd") for streams (#4179)

This commit adds a `fd` option to `StandardInput=`,
`StandardOutput=` and `StandardError=` properties in order to
connect standard streams to externally named descriptors provided
by some socket units.

This option looks for a file descriptor named as the corresponding
stream. Custom names can be specified, separated by a colon.
If multiple name-matches exist, the first matching fd will be used.
											
										
										
											2016-10-18 02:05:49 +02:00
+								        [EXEC_OUTPUT_SOCKET] = "socket",
 								        [EXEC_OUTPUT_NAMED_FD] = "fd",
-												greatly extend what we enforce as process properties

											
										
										
											2010-01-30 01:55:42 +01:00
+								};
 								DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
-												core: optionally create LOGIN_PROCESS or USER_PROCESS utmp entries

When generating utmp/wtmp entries, optionally add both LOGIN_PROCESS and
INIT_PROCESS entries or even all three of LOGIN_PROCESS, INIT_PROCESS
and USER_PROCESS entries, instead of just a single INIT_PROCESS entry.

With this change systemd may be used to not only invoke a getty directly
in a SysV-compliant way but alternatively also a login(1) implementation
or even forego getty and login entirely, and invoke arbitrary shells in
a way that they appear in who(1) or w(1).

This is preparation for a later commit that adds a "machinectl shell"
operation to invoke a shell in a container, in a way that is compatible
with who(1) and w(1).

											
										
										
											2015-08-23 13:14:04 +02:00
 								/* String names for the ExecUtmpMode values, indexed by enum constant. */
 								static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
 								        [EXEC_UTMP_INIT] = "init",
 								        [EXEC_UTMP_LOGIN] = "login",
 								        [EXEC_UTMP_USER] = "user",
 								};
 								/* NOTE(review): presumably expands to the exec_utmp_mode string<->enum lookup helpers
 								 * backed by the table above; the macro is defined outside this file section — confirm. */
 								DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
-												core: allow preserving contents of RuntimeDirectory= over process restart

This introduces the RuntimeDirectoryPreserve= option, which takes a
boolean argument or 'restart'.

Closes #6087.

											
										
										
											2017-07-17 09:22:25 +02:00
 								/* String names for the ExecPreserveMode values, indexed by enum constant. */
 								static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
 								        [EXEC_PRESERVE_NO] = "no",
 								        [EXEC_PRESERVE_YES] = "yes",
 								        [EXEC_PRESERVE_RESTART] = "restart",
 								};
 								/* NOTE(review): the _WITH_BOOLEAN variant is passed EXEC_PRESERVE_YES as an extra argument;
 								 * presumably this is the value that generic boolean "true" spellings are mapped to when
 								 * parsing — confirm against the macro definition, which is not visible here. */
 								DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
-												core: usually our enum's _INVALID and _MAX special values are named after the full type

In most cases we followed the rule that the special _INVALID and _MAX
values we use in our enums use the full type name as prefix (in contrast
to regular values that we often make shorter), do so for
ExecDirectoryType as well.

No functional changes, just a little bit of renaming to make this code
more like the rest.

											
										
										
											2017-09-28 16:58:43 +02:00
+								static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
-												core: add {State,Cache,Log,Configuration}Directory= (#6384)

This introduces {State,Cache,Log,Configuration}Directory=, which are
similar to RuntimeDirectory=. They create the directories under
/var/lib, /var/cache, /var/log, or /etc, respectively, with the mode
specified in {State,Cache,Log,Configuration}DirectoryMode=.

This also fixes #6391.
											
										
										
											2017-07-18 14:34:52 +02:00
+								        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
 								        [EXEC_DIRECTORY_STATE] = "StateDirectory",
 								        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
 								        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
 								        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
 								};
 								DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
-												core: add new per-unit setting KeyringMode= for controlling kernel keyring setup

Usually, it's a good thing that we isolate the kernel session keyring
for the various services and disconnect them from the user keyring.
However, in case of the cryptsetup key caching we actually want that
multiple instances of the cryptsetup service can share the keys in the
root user's user keyring, hence we need to be able to disable this logic
for them.

This adds KeyringMode=inherit|private|shared:

    inherit: don't do any keyring magic (this is the default in systemd --user)
    private: a private keyring as before (default in systemd --system)
    shared: the new setting

											
										
										
											2017-09-14 21:19:05 +02:00
 								/* String names for the ExecKeyringMode values, indexed by enum constant. */
 								static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
 								        [EXEC_KEYRING_INHERIT] = "inherit",
 								        [EXEC_KEYRING_PRIVATE] = "private",
 								        [EXEC_KEYRING_SHARED] = "shared",
 								};
 								/* NOTE(review): presumably expands to the exec_keyring_mode string<->enum lookup helpers
 								 * backed by the table above; the macro is defined outside this file section — confirm. */
 								DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);