summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile.am4
-rw-r--r--src/nspawn/nspawn.c92
-rw-r--r--src/shared/eventfd-util.c169
-rw-r--r--src/shared/eventfd-util.h43
4 files changed, 282 insertions, 26 deletions
diff --git a/Makefile.am b/Makefile.am
index f517f19df..4b6073551 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -805,7 +805,9 @@ libsystemd_shared_la_SOURCES = \
src/shared/ring.c \
src/shared/ring.h \
src/shared/async.c \
- src/shared/async.h
+ src/shared/async.h \
+ src/shared/eventfd-util.c \
+ src/shared/eventfd-util.h
nodist_libsystemd_shared_la_SOURCES = \
src/shared/errno-from-name.h \
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 0cd476cd9..a1d77244f 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -84,6 +84,7 @@
#include "def.h"
#include "rtnl-util.h"
#include "udev-util.h"
+#include "eventfd-util.h"
#include "blkid-util.h"
#include "gpt.h"
#include "siphash24.h"
@@ -2642,6 +2643,8 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) {
return r;
}
+static void nop_handler(int sig) {}
+
int main(int argc, char *argv[]) {
_cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
@@ -2653,8 +2656,8 @@ int main(int argc, char *argv[]) {
const char *console = NULL;
char veth_name[IFNAMSIZ];
bool secondary = false;
+ sigset_t mask, mask_chld;
pid_t pid = 0;
- sigset_t mask;
log_parse_environment();
log_open();
@@ -2816,36 +2819,44 @@ int main(int argc, char *argv[]) {
sd_notify(0, "READY=1");
assert_se(sigemptyset(&mask) == 0);
+ assert_se(sigemptyset(&mask_chld) == 0);
+ sigaddset(&mask_chld, SIGCHLD);
sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
for (;;) {
ContainerStatus container_status;
- int parent_ready_fd = -1, child_ready_fd = -1;
- eventfd_t x;
-
- parent_ready_fd = eventfd(0, EFD_CLOEXEC);
- if (parent_ready_fd < 0) {
- log_error("Failed to create event fd: %m");
+ int eventfds[2] = { -1, -1 };
+ struct sigaction sa = {
+ .sa_handler = nop_handler,
+ .sa_flags = SA_NOCLDSTOP,
+ };
+
+ /* Child can be killed before execv(), so handle SIGCHLD
+ * in order to interrupt parent's blocking calls and
+ * give it a chance to call wait() and terminate. */
+ r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
+ if (r < 0) {
+ log_error("Failed to change the signal mask: %m");
goto finish;
}
- child_ready_fd = eventfd(0, EFD_CLOEXEC);
- if (child_ready_fd < 0) {
- log_error("Failed to create event fd: %m");
+ r = sigaction(SIGCHLD, &sa, NULL);
+ if (r < 0) {
+ log_error("Failed to install SIGCHLD handler: %m");
goto finish;
}
- pid = syscall(__NR_clone,
- SIGCHLD|CLONE_NEWNS|
- (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
- (arg_private_network ? CLONE_NEWNET : 0), NULL);
+ pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
+ (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
+ (arg_private_network ? CLONE_NEWNET : 0), eventfds);
if (pid < 0) {
if (errno == EINVAL)
log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
else
log_error("clone() failed: %m");
+ r = pid;
goto finish;
}
@@ -2986,8 +2997,11 @@ int main(int argc, char *argv[]) {
/* Tell the parent that we are ready, and that
* it can cgroupify us to that we lack access
* to certain devices and resources. */
- eventfd_write(child_ready_fd, 1);
- child_ready_fd = safe_close(child_ready_fd);
+ r = eventfd_send_state(eventfds[1],
+ EVENTFD_CHILD_SUCCEEDED);
+ eventfds[1] = safe_close(eventfds[1]);
+ if (r < 0)
+ goto child_fail;
if (chdir(arg_directory) < 0) {
log_error("chdir(%s) failed: %m", arg_directory);
@@ -3089,8 +3103,10 @@ int main(int argc, char *argv[]) {
env_use = (char**) envp;
/* Wait until the parent is ready with the setup, too... */
- eventfd_read(parent_ready_fd, &x);
- parent_ready_fd = safe_close(parent_ready_fd);
+ r = eventfd_parent_succeeded(eventfds[0]);
+ eventfds[0] = safe_close(eventfds[0]);
+ if (r < 0)
+ goto child_fail;
if (arg_boot) {
char **a;
@@ -3121,17 +3137,27 @@ int main(int argc, char *argv[]) {
log_error("execv() failed: %m");
child_fail:
+ /* Tell the parent that the setup failed, so he
+ * can clean up resources and terminate. */
+ if (eventfds[1] != -1)
+ eventfd_send_state(eventfds[1],
+ EVENTFD_CHILD_FAILED);
_exit(EXIT_FAILURE);
}
fdset_free(fds);
fds = NULL;
- /* Wait until the child reported that it is ready with
- * all it needs to do with privileges. After we got
- * the notification we can make the process join its
- * cgroup which might limit what it can do */
- eventfd_read(child_ready_fd, &x);
+ /* Wait for the child event:
+ * If EVENTFD_CHILD_FAILED, the child will terminate soon.
+ * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
+ * it is ready with all it needs to do with priviliges.
+ * After we got the notification we can make the process
+ * join its cgroup which might limit what it can do */
+ r = eventfd_child_succeeded(eventfds[1]);
+ eventfds[1] = safe_close(eventfds[1]);
+ if (r < 0)
+ goto check_container_status;
r = register_machine(pid);
if (r < 0)
@@ -3153,10 +3179,25 @@ int main(int argc, char *argv[]) {
if (r < 0)
goto finish;
+ /* Block SIGCHLD here, before notifying child.
+ * process_pty() will handle it with the other signals. */
+ r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
+ if (r < 0)
+ goto finish;
+
+ /* Reset signal to default */
+ r = default_signals(SIGCHLD, -1);
+ if (r < 0)
+ goto finish;
+
/* Notify the child that the parent is ready with all
- * its setup, and thtat the child can now hand over
+ * its setup, and that the child can now hand over
* control to the code to run inside the container. */
- eventfd_write(parent_ready_fd, 1);
+ r = eventfd_send_state(eventfds[0],
+ EVENTFD_PARENT_SUCCEEDED);
+ eventfds[0] = safe_close(eventfds[0]);
+ if (r < 0)
+ goto finish;
k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
if (k < 0) {
@@ -3170,6 +3211,7 @@ int main(int argc, char *argv[]) {
/* Kill if it is not dead yet anyway */
terminate_machine(pid);
+check_container_status:
/* Redundant, but better safe than sorry */
kill(pid, SIGKILL);
diff --git a/src/shared/eventfd-util.c b/src/shared/eventfd-util.c
new file mode 100644
index 000000000..27b7cf788
--- /dev/null
+++ b/src/shared/eventfd-util.c
@@ -0,0 +1,169 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+ This file is part of systemd.
+
+ Copyright 2014 Djalal Harouni
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <assert.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/eventfd.h>
+#include <sys/syscall.h>
+
+#include "eventfd-util.h"
+#include "util.h"
+
+
+/*
+ * Use this to create processes that need to setup a full context
+ * and sync it with their parents using cheap mechanisms.
+ *
+ * This will create two blocking eventfd(s). A pair for the parent and
+ * the other for the child so they can be used as a notify mechanism.
+ * Each process will gets its copy of the parent and child eventfds.
+ *
+ * This is useful in case:
+ * 1) If the parent fails or dies, the child must die.
+ * 2) Child will install PR_SET_PDEATHSIG as soon as possible.
+ * 3) Parent and child need to sync using less resources.
+ * 4) If parent is not able to install a SIGCHLD handler:
+ * parent will wait using a blocking eventfd_read() or
+ * eventfd_child_succeeded() call on the child eventfd.
+ *
+ * * If the child setup succeeded, child should notify with an
+ * EVENTFD_CHILD_SUCCEEDED, parent will continue.
+ * * If the child setup failed, child should notify with an
+ * EVENTFD_CHILD_FAILED before any _exit(). This avoids blocking
+ * the parent.
+ *
+ * 5) If parent is able to install a SIGCHLD handler:
+ * An empty signal handler without SA_RESTART will do it, since the
+ * blocking eventfd_read() or eventfd_parent_succeeded() of the
+ * parent will be interrupted by SIGCHLD and the call will fail with
+ * EINTR. This is useful in case the child dies abnormaly and did
+ * not have a chance to notify its parent using EVENTFD_CHILD_FAILED.
+ *
+ * 6) Call wait*() in the main instead of the signal handler in order
+ * to: 1) reduce side effects and 2) have a better handling for
+ * child termination in order to reduce various race conditions.
+ *
+ *
+ * The return value of clone_with_eventfd() is the same of clone().
+ * On success the eventfds[] will contain the two eventfd(s). These
+ * file descriptors can be closed later with safe_close(). On failure,
+ * a negative value is returned in the caller's context, and errno will
+ * be set appropriately.
+ *
+ * Extra preliminary work:
+ * 1) Child can wait before starting its setup by using the
+ * eventfd_recv_start() call on the parent eventfd, in that case the
+ * parent must notify with EVENTFD_START, after doing any preliminary
+ * work.
+ *
+ * Note: this function depends on systemd internal functions
+ * safe_close() and it should be used only by direct binaries, no
+ * libraries.
+ */
+pid_t clone_with_eventfd(int flags, int eventfds[2]) {
+ pid_t pid;
+
+ assert(eventfds);
+
+ eventfds[0] = eventfd(EVENTFD_INIT, EFD_CLOEXEC);
+ if (eventfds[0] < 0)
+ return -1;
+
+ eventfds[1] = eventfd(EVENTFD_INIT, EFD_CLOEXEC);
+ if (eventfds[1] < 0)
+ goto err_eventfd0;
+
+ pid = syscall(__NR_clone, flags, NULL);
+ if (pid < 0)
+ goto err_eventfd1;
+
+ return pid;
+
+err_eventfd1:
+ eventfds[1] = safe_close(eventfds[1]);
+err_eventfd0:
+ eventfds[0] = safe_close(eventfds[0]);
+ return -1;
+}
+
+int eventfd_send_state(int efd, eventfd_t s) {
+ return eventfd_write(efd, s);
+}
+
+/*
+ * Receive an eventfd state on the eventfd file descriptor.
+ *
+ * If the third argument is set to a value other than zero, then this
+ * function will compare the received value with this argument and set
+ * the return value.
+ *
+ * On success return 0. On error, -1 will be returned, and errno will
+ * be set appropriately.
+ */
+int eventfd_recv_state(int efd, eventfd_t *e, eventfd_t s) {
+ int ret;
+
+ ret = eventfd_read(efd, e);
+ if (ret < 0)
+ return ret;
+ else if (s != 0 && *e != s) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Receive the EVENTFD_START state on the eventfd file descriptor.
+ *
+ * On Success return 0. On error, -1 will be returned, and errno will
+ * be set appropriately.
+ */
+int eventfd_recv_start(int efd) {
+ eventfd_t e = EVENTFD_INIT;
+ return eventfd_recv_state(efd, &e, EVENTFD_START);
+}
+
+/*
+ * Receive the EVENTFD_PARENT_SUCCEEDED state on the eventfd file
+ * descriptor.
+ *
+ * On Success return 0. On error, -1 will be returned, and errno will
+ * be set appropriately.
+ */
+int eventfd_parent_succeeded(int efd) {
+ eventfd_t e = EVENTFD_INIT;
+ return eventfd_recv_state(efd, &e, EVENTFD_PARENT_SUCCEEDED);
+}
+
+/*
+ * Receive the EVENTFD_CHILD_SUCCEEDED state on the eventfd file
+ * descriptor.
+ *
+ * On Success return 0. On error, -1 will be returned, and errno will
+ * be set appropriately.
+ */
+int eventfd_child_succeeded(int efd) {
+ eventfd_t e = EVENTFD_INIT;
+ return eventfd_recv_state(efd, &e, EVENTFD_CHILD_SUCCEEDED);
+}
diff --git a/src/shared/eventfd-util.h b/src/shared/eventfd-util.h
new file mode 100644
index 000000000..0120f0409
--- /dev/null
+++ b/src/shared/eventfd-util.h
@@ -0,0 +1,43 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+#pragma once
+
+/***
+ This file is part of systemd.
+
+ Copyright 2014 Djalal Harouni
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sys/types.h>
+#include <sys/eventfd.h>
+
+enum {
+ EVENTFD_INIT,
+ EVENTFD_START,
+ EVENTFD_PARENT_SUCCEEDED,
+ EVENTFD_PARENT_FAILED,
+ EVENTFD_CHILD_SUCCEEDED,
+ EVENTFD_CHILD_FAILED,
+};
+
+pid_t clone_with_eventfd(int flags, int eventfds[2]);
+
+int eventfd_send_state(int efd, eventfd_t s);
+int eventfd_recv_state(int efd, eventfd_t *e, eventfd_t s);
+
+int eventfd_recv_start(int efd);
+int eventfd_parent_succeeded(int efd);
+int eventfd_child_succeeded(int efd);