diff --git a/Makefile.am b/Makefile.am index f517f19df5..4b60735515 100644 --- a/Makefile.am +++ b/Makefile.am @@ -805,7 +805,9 @@ libsystemd_shared_la_SOURCES = \ src/shared/ring.c \ src/shared/ring.h \ src/shared/async.c \ - src/shared/async.h + src/shared/async.h \ + src/shared/eventfd-util.c \ + src/shared/eventfd-util.h nodist_libsystemd_shared_la_SOURCES = \ src/shared/errno-from-name.h \ diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 0cd476cd9e..a1d77244f8 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -84,6 +84,7 @@ #include "def.h" #include "rtnl-util.h" #include "udev-util.h" +#include "eventfd-util.h" #include "blkid-util.h" #include "gpt.h" #include "siphash24.h" @@ -2642,6 +2643,8 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) { return r; } +static void nop_handler(int sig) {} + int main(int argc, char *argv[]) { _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL; @@ -2653,8 +2656,8 @@ int main(int argc, char *argv[]) { const char *console = NULL; char veth_name[IFNAMSIZ]; bool secondary = false; + sigset_t mask, mask_chld; pid_t pid = 0; - sigset_t mask; log_parse_environment(); log_open(); @@ -2816,36 +2819,44 @@ int main(int argc, char *argv[]) { sd_notify(0, "READY=1"); assert_se(sigemptyset(&mask) == 0); + assert_se(sigemptyset(&mask_chld) == 0); + sigaddset(&mask_chld, SIGCHLD); sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1); assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0); for (;;) { ContainerStatus container_status; - int parent_ready_fd = -1, child_ready_fd = -1; - eventfd_t x; + int eventfds[2] = { -1, -1 }; + struct sigaction sa = { + .sa_handler = nop_handler, + .sa_flags = SA_NOCLDSTOP, + }; - parent_ready_fd = eventfd(0, EFD_CLOEXEC); - if (parent_ready_fd < 0) { - log_error("Failed to create event fd: %m"); + /* Child can be killed before execv(), so handle SIGCHLD + * in order to interrupt parent's blocking calls and + * give it a chance to call wait() and terminate. */ + r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL); + if (r < 0) { + log_error("Failed to change the signal mask: %m"); goto finish; } - child_ready_fd = eventfd(0, EFD_CLOEXEC); - if (child_ready_fd < 0) { - log_error("Failed to create event fd: %m"); + r = sigaction(SIGCHLD, &sa, NULL); + if (r < 0) { + log_error("Failed to install SIGCHLD handler: %m"); goto finish; } - pid = syscall(__NR_clone, - SIGCHLD|CLONE_NEWNS| - (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)| - (arg_private_network ? CLONE_NEWNET : 0), NULL); + pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS| + (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)| + (arg_private_network ? CLONE_NEWNET : 0), eventfds); if (pid < 0) { if (errno == EINVAL) log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m"); else log_error("clone() failed: %m"); + r = pid; goto finish; } @@ -2986,8 +2997,11 @@ int main(int argc, char *argv[]) { /* Tell the parent that we are ready, and that * it can cgroupify us to that we lack access * to certain devices and resources. */ - eventfd_write(child_ready_fd, 1); - child_ready_fd = safe_close(child_ready_fd); + r = eventfd_send_state(eventfds[1], + EVENTFD_CHILD_SUCCEEDED); + eventfds[1] = safe_close(eventfds[1]); + if (r < 0) + goto child_fail; if (chdir(arg_directory) < 0) { log_error("chdir(%s) failed: %m", arg_directory); @@ -3089,8 +3103,10 @@ int main(int argc, char *argv[]) { env_use = (char**) envp; /* Wait until the parent is ready with the setup, too... */ - eventfd_read(parent_ready_fd, &x); - parent_ready_fd = safe_close(parent_ready_fd); + r = eventfd_parent_succeeded(eventfds[0]); + eventfds[0] = safe_close(eventfds[0]); + if (r < 0) + goto child_fail; if (arg_boot) { char **a; @@ -3121,17 +3137,27 @@ int main(int argc, char *argv[]) { log_error("execv() failed: %m"); child_fail: + /* Tell the parent that the setup failed, so he + * can clean up resources and terminate. */ + if (eventfds[1] != -1) + eventfd_send_state(eventfds[1], + EVENTFD_CHILD_FAILED); _exit(EXIT_FAILURE); } fdset_free(fds); fds = NULL; - /* Wait until the child reported that it is ready with - * all it needs to do with privileges. After we got - * the notification we can make the process join its - * cgroup which might limit what it can do */ - eventfd_read(child_ready_fd, &x); + /* Wait for the child event: + * If EVENTFD_CHILD_FAILED, the child will terminate soon. + * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that + * it is ready with all it needs to do with priviliges. + * After we got the notification we can make the process + * join its cgroup which might limit what it can do */ + r = eventfd_child_succeeded(eventfds[1]); + eventfds[1] = safe_close(eventfds[1]); + if (r < 0) + goto check_container_status; r = register_machine(pid); if (r < 0) @@ -3153,10 +3179,25 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; + /* Block SIGCHLD here, before notifying child. + * process_pty() will handle it with the other signals. */ + r = sigprocmask(SIG_BLOCK, &mask_chld, NULL); + if (r < 0) + goto finish; + + /* Reset signal to default */ + r = default_signals(SIGCHLD, -1); + if (r < 0) + goto finish; + /* Notify the child that the parent is ready with all - * its setup, and thtat the child can now hand over + * its setup, and that the child can now hand over * control to the code to run inside the container. */ - eventfd_write(parent_ready_fd, 1); + r = eventfd_send_state(eventfds[0], + EVENTFD_PARENT_SUCCEEDED); + eventfds[0] = safe_close(eventfds[0]); + if (r < 0) + goto finish; k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3); if (k < 0) { @@ -3170,6 +3211,7 @@ int main(int argc, char *argv[]) { /* Kill if it is not dead yet anyway */ terminate_machine(pid); +check_container_status: /* Redundant, but better safe than sorry */ kill(pid, SIGKILL); diff --git a/src/shared/eventfd-util.c b/src/shared/eventfd-util.c new file mode 100644 index 0000000000..27b7cf788f --- /dev/null +++ b/src/shared/eventfd-util.c @@ -0,0 +1,169 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2014 Djalal Harouni + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include +#include +#include +#include + +#include "eventfd-util.h" +#include "util.h" + + +/* + * Use this to create processes that need to setup a full context + * and sync it with their parents using cheap mechanisms. + * + * This will create two blocking eventfd(s). A pair for the parent and + * the other for the child so they can be used as a notify mechanism. + * Each process will gets its copy of the parent and child eventfds. + * + * This is useful in case: + * 1) If the parent fails or dies, the child must die. + * 2) Child will install PR_SET_PDEATHSIG as soon as possible. + * 3) Parent and child need to sync using less resources. + * 4) If parent is not able to install a SIGCHLD handler: + * parent will wait using a blocking eventfd_read() or + * eventfd_child_succeeded() call on the child eventfd. + * + * * If the child setup succeeded, child should notify with an + * EVENTFD_CHILD_SUCCEEDED, parent will continue. + * * If the child setup failed, child should notify with an + * EVENTFD_CHILD_FAILED before any _exit(). This avoids blocking + * the parent. + * + * 5) If parent is able to install a SIGCHLD handler: + * An empty signal handler without SA_RESTART will do it, since the + * blocking eventfd_read() or eventfd_parent_succeeded() of the + * parent will be interrupted by SIGCHLD and the call will fail with + * EINTR. This is useful in case the child dies abnormaly and did + * not have a chance to notify its parent using EVENTFD_CHILD_FAILED. + * + * 6) Call wait*() in the main instead of the signal handler in order + * to: 1) reduce side effects and 2) have a better handling for + * child termination in order to reduce various race conditions. + * + * + * The return value of clone_with_eventfd() is the same of clone(). + * On success the eventfds[] will contain the two eventfd(s). These + * file descriptors can be closed later with safe_close(). On failure, + * a negative value is returned in the caller's context, and errno will + * be set appropriately. + * + * Extra preliminary work: + * 1) Child can wait before starting its setup by using the + * eventfd_recv_start() call on the parent eventfd, in that case the + * parent must notify with EVENTFD_START, after doing any preliminary + * work. + * + * Note: this function depends on systemd internal functions + * safe_close() and it should be used only by direct binaries, no + * libraries. + */ +pid_t clone_with_eventfd(int flags, int eventfds[2]) { + pid_t pid; + + assert(eventfds); + + eventfds[0] = eventfd(EVENTFD_INIT, EFD_CLOEXEC); + if (eventfds[0] < 0) + return -1; + + eventfds[1] = eventfd(EVENTFD_INIT, EFD_CLOEXEC); + if (eventfds[1] < 0) + goto err_eventfd0; + + pid = syscall(__NR_clone, flags, NULL); + if (pid < 0) + goto err_eventfd1; + + return pid; + +err_eventfd1: + eventfds[1] = safe_close(eventfds[1]); +err_eventfd0: + eventfds[0] = safe_close(eventfds[0]); + return -1; +} + +int eventfd_send_state(int efd, eventfd_t s) { + return eventfd_write(efd, s); +} + +/* + * Receive an eventfd state on the eventfd file descriptor. + * + * If the third argument is set to a value other than zero, then this + * function will compare the received value with this argument and set + * the return value. + * + * On success return 0. On error, -1 will be returned, and errno will + * be set appropriately. + */ +int eventfd_recv_state(int efd, eventfd_t *e, eventfd_t s) { + int ret; + + ret = eventfd_read(efd, e); + if (ret < 0) + return ret; + else if (s != 0 && *e != s) { + errno = EINVAL; + return -1; + } + + return 0; +} + +/* + * Receive the EVENTFD_START state on the eventfd file descriptor. + * + * On Success return 0. On error, -1 will be returned, and errno will + * be set appropriately. + */ +int eventfd_recv_start(int efd) { + eventfd_t e = EVENTFD_INIT; + return eventfd_recv_state(efd, &e, EVENTFD_START); +} + +/* + * Receive the EVENTFD_PARENT_SUCCEEDED state on the eventfd file + * descriptor. + * + * On Success return 0. On error, -1 will be returned, and errno will + * be set appropriately. + */ +int eventfd_parent_succeeded(int efd) { + eventfd_t e = EVENTFD_INIT; + return eventfd_recv_state(efd, &e, EVENTFD_PARENT_SUCCEEDED); +} + +/* + * Receive the EVENTFD_CHILD_SUCCEEDED state on the eventfd file + * descriptor. + * + * On Success return 0. On error, -1 will be returned, and errno will + * be set appropriately. + */ +int eventfd_child_succeeded(int efd) { + eventfd_t e = EVENTFD_INIT; + return eventfd_recv_state(efd, &e, EVENTFD_CHILD_SUCCEEDED); +} diff --git a/src/shared/eventfd-util.h b/src/shared/eventfd-util.h new file mode 100644 index 0000000000..0120f0409e --- /dev/null +++ b/src/shared/eventfd-util.h @@ -0,0 +1,43 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +#pragma once + +/*** + This file is part of systemd. + + Copyright 2014 Djalal Harouni + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include + +enum { + EVENTFD_INIT, + EVENTFD_START, + EVENTFD_PARENT_SUCCEEDED, + EVENTFD_PARENT_FAILED, + EVENTFD_CHILD_SUCCEEDED, + EVENTFD_CHILD_FAILED, +}; + +pid_t clone_with_eventfd(int flags, int eventfds[2]); + +int eventfd_send_state(int efd, eventfd_t s); +int eventfd_recv_state(int efd, eventfd_t *e, eventfd_t s); + +int eventfd_recv_start(int efd); +int eventfd_parent_succeeded(int efd); +int eventfd_child_succeeded(int efd);