From 1da37e58ffb2f8bfa129a550c09bf2d458d7c782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Wed, 16 Sep 2020 17:38:26 +0200 Subject: [PATCH 1/8] core/execute: refactor creation of array with fds to keep during execution We close fds in two phases, first some and then the some more. When passing a list of fds to exclude from closing to the closing function, we would pass some in an array and the rest as separate arguments. For the fds which should be excluded in both closing phases, let's always create the array and put the relevant fds there. This has the advantage that if more fds to exclude in both phases are added later, we don't need to add more positional arguments. The list passed to setup_pam() is not changed. I think we could pass more fds to close there, but I'm leaving that unchanged. The setting of FD_CLOEXEC on an already open fds is dropped. The fd is opened in service_allocate_exec_fd() and there is no reason to suspect that it might have been opened incorrectly. If some rogue code is unsetting our FD_CLOEXEC bits, then it might flip any fd, no reason to single this one out. --- src/core/execute.c | 91 ++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index d21de8c41e..a78954c0c3 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1247,8 +1247,8 @@ static int setup_pam( * termination */ barrier_set_role(&barrier, BARRIER_CHILD); - /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds - * are open here that have been opened by PAM. */ + /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only + * those fds are open here that have been opened by PAM. */ (void) close_many(fds, n_fds); /* Drop privileges - we don't need any to pam_close_session @@ -3407,7 +3407,6 @@ static int close_remaining_fds( const DynamicCreds *dcreds, int user_lookup_fd, int socket_fd, - int exec_fd, const int *fds, size_t n_fds) { size_t n_dont_close = 0; @@ -3424,8 +3423,6 @@ static int close_remaining_fds( if (socket_fd >= 0) dont_close[n_dont_close++] = socket_fd; - if (exec_fd >= 0) - dont_close[n_dont_close++] = exec_fd; if (n_fds > 0) { memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds); n_dont_close += n_fds; @@ -3602,6 +3599,35 @@ bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) { return c->cpu_affinity_from_numa; } +static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) { + int r; + + assert(fds); + assert(n_fds); + assert(*n_fds < fds_size); + assert(ret_fd); + + if (fd < 0) { + *ret_fd = -1; + return 0; + } + + if (fd < 3 + (int) *n_fds) { + /* Let's move the fd up, so that it's outside of the fd range we will use to store + * the fds we pass to the process (or which are closed only during execve). */ + + r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds); + if (r < 0) + return -errno; + + CLOSE_AND_REPLACE(fd, r); + } + + *ret_fd = fds[*n_fds] = fd; + (*n_fds) ++; + return 1; +} + static int exec_child( Unit *unit, const ExecCommand *command, @@ -3619,7 +3645,7 @@ static int exec_child( int *exit_status) { _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL; - int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1; + int r, ngids = 0, exec_fd; _cleanup_free_ gid_t *supplementary_gids = NULL; const char *username = NULL, *groupname = NULL; _cleanup_free_ char *home_buffer = NULL; @@ -3646,7 +3672,8 @@ static int exec_child( gid_t saved_gid = getgid(); uid_t uid = UID_INVALID; gid_t gid = GID_INVALID; - size_t n_fds; + size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */ + n_keep_fds; /* total number of fds not to close */ int secure_bits; _cleanup_free_ gid_t *gids_after_pam = NULL; int ngids_after_pam = 0; @@ -3690,8 +3717,17 @@ static int exec_child( /* In case anything used libc syslog(), close this here, too */ closelog(); - n_fds = n_socket_fds + n_storage_fds; - r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds); + int keep_fds[n_fds + 1]; + memcpy_safe(keep_fds, fds, n_fds * sizeof(int)); + n_keep_fds = n_fds; + + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m"); + } + + r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds); if (r < 0) { *exit_status = EXIT_FDS; return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m"); @@ -4086,6 +4122,7 @@ static int exec_child( /* Let's call into PAM after we set up our own idea of resource limits to that pam_limits * wins here. (See above.) */ + /* All fds passed in the fds array will be closed in the pam child process. */ r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds); if (r < 0) { *exit_status = EXIT_PAM; @@ -4230,41 +4267,7 @@ static int exec_child( * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd * however if we have it as we want to keep it open until the final execve(). */ - if (params->exec_fd >= 0) { - exec_fd = params->exec_fd; - - if (exec_fd < 3 + (int) n_fds) { - int moved_fd; - - /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the - * process we are about to execute. */ - - moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds); - if (moved_fd < 0) { - *exit_status = EXIT_FDS; - return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m"); - } - - CLOSE_AND_REPLACE(exec_fd, moved_fd); - } else { - /* This fd should be FD_CLOEXEC already, but let's make sure. */ - r = fd_cloexec(exec_fd, true); - if (r < 0) { - *exit_status = EXIT_FDS; - return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m"); - } - } - - fds_with_exec_fd = newa(int, n_fds + 1); - memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int)); - fds_with_exec_fd[n_fds] = exec_fd; - n_fds_with_exec_fd = n_fds + 1; - } else { - fds_with_exec_fd = fds; - n_fds_with_exec_fd = n_fds; - } - - r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd); + r = close_all_fds(keep_fds, n_keep_fds); if (r >= 0) r = shift_fds(fds, n_fds); if (r >= 0) From 5ca9139ace34a2520a75b2be660c5374c4098ed0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Thu, 17 Sep 2020 15:02:47 +0200 Subject: [PATCH 2/8] basic/path-util: let find_executable_full() optionally return an fd --- src/basic/path-util.c | 52 +++++++++++++++++++++++++++++++++------ src/basic/path-util.h | 6 ++--- src/core/execute.c | 2 +- src/test/test-path-util.c | 43 +++++++++++++++++++++++++++++--- 4 files changed, 87 insertions(+), 16 deletions(-) diff --git a/src/basic/path-util.c b/src/basic/path-util.c index a36cf8332c..c14d885d1a 100644 --- a/src/basic/path-util.c +++ b/src/basic/path-util.c @@ -585,22 +585,53 @@ char* path_join_internal(const char *first, ...) { return joined; } -int find_executable_full(const char *name, bool use_path_envvar, char **ret) { +static int check_x_access(const char *path, int *ret_fd) { + if (ret_fd) { + _cleanup_close_ int fd = -1; + int r; + + /* We need to use O_PATH because there may be executables for which we have only exec + * permissions, but not read (usually suid executables). */ + fd = open(path, O_PATH|O_CLOEXEC); + if (fd < 0) + return -errno; + + r = access_fd(fd, X_OK); + if (r < 0) + return r; + + *ret_fd = TAKE_FD(fd); + } else { + /* Let's optimize things a bit by not opening the file if we don't need the fd. */ + if (access(path, X_OK) < 0) + return -errno; + } + + return 0; +} + +int find_executable_full(const char *name, bool use_path_envvar, char **ret_filename, int *ret_fd) { int last_error, r; const char *p = NULL; assert(name); if (is_path(name)) { - if (access(name, X_OK) < 0) - return -errno; + _cleanup_close_ int fd = -1; - if (ret) { - r = path_make_absolute_cwd(name, ret); + r = check_x_access(name, ret_fd ? &fd : NULL); + if (r < 0) + return r; + + if (ret_filename) { + r = path_make_absolute_cwd(name, ret_filename); if (r < 0) return r; } + if (ret_fd) + *ret_fd = TAKE_FD(fd); + return 0; } @@ -613,8 +644,10 @@ int find_executable_full(const char *name, bool use_path_envvar, char **ret) { last_error = -ENOENT; + /* Resolve a single-component name to a full path */ for (;;) { _cleanup_free_ char *j = NULL, *element = NULL; + _cleanup_close_ int fd = -1; r = extract_first_word(&p, &element, ":", EXTRACT_RELAX|EXTRACT_DONT_COALESCE_SEPARATORS); if (r < 0) @@ -629,7 +662,8 @@ int find_executable_full(const char *name, bool use_path_envvar, char **ret) { if (!j) return -ENOMEM; - if (access(j, X_OK) >= 0) { + r = check_x_access(j, ret_fd ? &fd : NULL); + if (r >= 0) { _cleanup_free_ char *with_dash; with_dash = strjoin(j, "/"); @@ -643,8 +677,10 @@ int find_executable_full(const char *name, bool use_path_envvar, char **ret) { /* We can't just `continue` inverting this case, since we need to update last_error. */ if (errno == ENOTDIR) { /* Found it! */ - if (ret) - *ret = path_simplify(TAKE_PTR(j), false); + if (ret_filename) + *ret_filename = path_simplify(TAKE_PTR(j), false); + if (ret_fd) + *ret_fd = TAKE_FD(fd); return 0; } diff --git a/src/basic/path-util.h b/src/basic/path-util.h index bd8c14903e..13e87731a6 100644 --- a/src/basic/path-util.h +++ b/src/basic/path-util.h @@ -88,9 +88,9 @@ int path_strv_make_absolute_cwd(char **l); char** path_strv_resolve(char **l, const char *root); char** path_strv_resolve_uniq(char **l, const char *root); -int find_executable_full(const char *name, bool use_path_envvar, char **ret); -static inline int find_executable(const char *name, char **ret) { - return find_executable_full(name, true, ret); +int find_executable_full(const char *name, bool use_path_envvar, char **ret_filename, int *ret_fd); +static inline int find_executable(const char *name, char **ret_filename) { + return find_executable_full(name, true, ret_filename, NULL); } bool paths_check_timestamp(const char* const* paths, usec_t *paths_ts_usec, bool update); diff --git a/src/core/execute.c b/src/core/execute.c index a78954c0c3..bdd6857fe1 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -4230,7 +4230,7 @@ static int exec_child( * shall execute. */ _cleanup_free_ char *executable = NULL; - r = find_executable_full(command->path, false, &executable); + r = find_executable_full(command->path, false, &executable, NULL); if (r < 0) { if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { log_struct_errno(LOG_INFO, r, diff --git a/src/test/test-path-util.c b/src/test/test-path-util.c index e98c19dd6c..3ee3795b48 100644 --- a/src/test/test-path-util.c +++ b/src/test/test-path-util.c @@ -8,6 +8,7 @@ #include "macro.h" #include "mountpoint-util.h" #include "path-util.h" +#include "process-util.h" #include "rm-rf.h" #include "stat-util.h" #include "string-util.h" @@ -169,12 +170,12 @@ static void test_find_executable_full(void) { log_info("/* %s */", __func__); - assert_se(find_executable_full("sh", true, &p) == 0); + assert_se(find_executable_full("sh", true, &p, NULL) == 0); puts(p); assert_se(streq(basename(p), "sh")); free(p); - assert_se(find_executable_full("sh", false, &p) == 0); + assert_se(find_executable_full("sh", false, &p, NULL) == 0); puts(p); assert_se(streq(basename(p), "sh")); free(p); @@ -186,12 +187,12 @@ static void test_find_executable_full(void) { assert_se(unsetenv("PATH") >= 0); - assert_se(find_executable_full("sh", true, &p) == 0); + assert_se(find_executable_full("sh", true, &p, NULL) == 0); puts(p); assert_se(streq(basename(p), "sh")); free(p); - assert_se(find_executable_full("sh", false, &p) == 0); + assert_se(find_executable_full("sh", false, &p, NULL) == 0); puts(p); assert_se(streq(basename(p), "sh")); free(p); @@ -236,6 +237,39 @@ static void test_find_executable(const char *self) { assert_se(find_executable("/proc/filesystems", &p) == -EACCES); } +static void test_find_executable_exec_one(const char *path) { + _cleanup_free_ char *t = NULL; + _cleanup_close_ int fd = -1; + pid_t pid; + int r; + + r = find_executable_full(path, false, &t, &fd); + + log_info_errno(r, "%s: %s → %s: %d/%m", __func__, path, t ?: "-", fd); + + assert_se(fd > STDERR_FILENO); + assert_se(path_is_absolute(t)); + if (path_is_absolute(path)) + assert_se(streq(t, path)); + + pid = fork(); + assert_se(pid >= 0); + if (pid == 0) { + fexecve(fd, STRV_MAKE(t, "--version"), STRV_MAKE(NULL)); + log_error_errno(errno, "fexecve: %m"); + _exit(EXIT_FAILURE); + } + + assert_se(wait_for_terminate_and_check(t, pid, WAIT_LOG) == 0); +} + +static void test_find_executable_exec(void) { + log_info("/* %s */", __func__); + + test_find_executable_exec_one("touch"); + test_find_executable_exec_one("/bin/touch"); +} + static void test_prefixes(void) { static const char* const values[] = { "/a/b/c/d", @@ -717,6 +751,7 @@ int main(int argc, char **argv) { test_path_equal_root(); test_find_executable_full(); test_find_executable(argv[0]); + test_find_executable_exec(); test_prefixes(); test_path_join(); test_fsck_exists(); From b83d505087494483ce7e53f438ecbd8f3254f0c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Thu, 17 Sep 2020 15:01:26 +0200 Subject: [PATCH 3/8] core: use fexecve() to spawn children We base the smack/selinux setup on the executable. Let's open the file once and use the same fd for that setup and the subsequent execve. --- src/core/execute.c | 21 ++++++++++++++------- src/shared/exec-util.c | 1 - 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index bdd6857fe1..f1d6f87755 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2881,11 +2881,11 @@ static int setup_credentials( #if ENABLE_SMACK static int setup_smack( const ExecContext *context, - const char *executable) { + int executable_fd) { int r; assert(context); - assert(executable); + assert(executable_fd >= 0); if (context->smack_process_label) { r = mac_smack_apply_pid(0, context->smack_process_label); @@ -2896,7 +2896,7 @@ static int setup_smack( else { _cleanup_free_ char *exec_label = NULL; - r = mac_smack_read(executable, SMACK_ATTR_EXEC, &exec_label); + r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label); if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP)) return r; @@ -3717,7 +3717,7 @@ static int exec_child( /* In case anything used libc syslog(), close this here, too */ closelog(); - int keep_fds[n_fds + 1]; + int keep_fds[n_fds + 2]; memcpy_safe(keep_fds, fds, n_fds * sizeof(int)); n_keep_fds = n_fds; @@ -4230,7 +4230,8 @@ static int exec_child( * shall execute. */ _cleanup_free_ char *executable = NULL; - r = find_executable_full(command->path, false, &executable, NULL); + _cleanup_close_ int executable_fd = -1; + r = find_executable_full(command->path, false, &executable, &executable_fd); if (r < 0) { if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { log_struct_errno(LOG_INFO, r, @@ -4253,6 +4254,12 @@ static int exec_child( "EXECUTABLE=%s", command->path); } + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m"); + } + #if HAVE_SELINUX if (needs_sandboxing && use_selinux && params->selinux_context_net && socket_fd >= 0) { r = mac_selinux_get_child_mls_label(socket_fd, executable, context->selinux_context, &mac_selinux_context_net); @@ -4301,7 +4308,7 @@ static int exec_child( /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the * process. This is the latest place before dropping capabilities. Other MAC context are set later. */ if (use_smack) { - r = setup_smack(context, executable); + r = setup_smack(context, executable_fd); if (r < 0) { *exit_status = EXIT_SMACK_PROCESS_LABEL; return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m"); @@ -4571,7 +4578,7 @@ static int exec_child( } } - execve(executable, final_argv, accum_env); + fexecve(executable_fd, final_argv, accum_env); r = -errno; if (exec_fd >= 0) { diff --git a/src/shared/exec-util.c b/src/shared/exec-util.c index 8fb936dcce..a93c206d9a 100644 --- a/src/shared/exec-util.c +++ b/src/shared/exec-util.c @@ -33,7 +33,6 @@ assert_cc(EAGAIN == EWOULDBLOCK); static int do_spawn(const char *path, char *argv[], int stdout_fd, pid_t *pid) { - pid_t _pid; int r; From a6d9111c67264a8828a563ccc23a24144b879942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Fri, 18 Sep 2020 14:28:08 +0200 Subject: [PATCH 4/8] core/execute: fall back to execve() for scripts fexecve() fails with ENOENT and we need a fallback. Add appropriate test. --- src/core/execute.c | 3 +-- src/shared/exec-util.c | 20 ++++++++++++++++++++ src/shared/exec-util.h | 2 ++ src/test/test-path-util.c | 9 +++++++-- test/meson.build | 2 ++ test/test-path-util/script.sh | 6 ++++++ 6 files changed, 38 insertions(+), 4 deletions(-) create mode 100755 test/test-path-util/script.sh diff --git a/src/core/execute.c b/src/core/execute.c index f1d6f87755..11e172f61b 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -4578,8 +4578,7 @@ static int exec_child( } } - fexecve(executable_fd, final_argv, accum_env); - r = -errno; + r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env); if (exec_fd >= 0) { uint8_t hot = 0; diff --git a/src/shared/exec-util.c b/src/shared/exec-util.c index a93c206d9a..a0bb1567ce 100644 --- a/src/shared/exec-util.c +++ b/src/shared/exec-util.c @@ -443,3 +443,23 @@ ExecCommandFlags exec_command_flags_from_string(const char *s) { else return 1 << idx; } + +int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]) { + fexecve(executable_fd, argv, envp); + if (errno == ENOENT) + /* A script? Let's fall back to execve(). + * + * fexecve(3): "If fd refers to a script (i.e., it is an executable text file that names a + * script interpreter with a first line that begins with the characters #!) and the + * close-on-exec flag has been set for fd, then fexecve() fails with the error ENOENT. This + * error occurs because, by the time the script interpreter is executed, fd has already been + * closed because of the close-on-exec flag. Thus, the close-on-exec flag can't be set on fd + * if it refers to a script." + * + * Unfortunately, if we unset close-on-exec, the script will be executed just fine, but (at + * least in case of bash) the script name, $0, will be shown as /dev/fd/nnn, which breaks + * scripts which make use of $0. Thus, let's fall back to execve() in this case. + */ + execve(executable, argv, envp); + return -errno; +} diff --git a/src/shared/exec-util.h b/src/shared/exec-util.h index 9fe9012516..65556249c1 100644 --- a/src/shared/exec-util.h +++ b/src/shared/exec-util.h @@ -45,3 +45,5 @@ extern const gather_stdout_callback_t gather_environment[_STDOUT_CONSUME_MAX]; const char* exec_command_flags_to_string(ExecCommandFlags i); ExecCommandFlags exec_command_flags_from_string(const char *s); + +int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]); diff --git a/src/test/test-path-util.c b/src/test/test-path-util.c index 3ee3795b48..c05a5d5b4a 100644 --- a/src/test/test-path-util.c +++ b/src/test/test-path-util.c @@ -4,6 +4,7 @@ #include #include "alloc-util.h" +#include "exec-util.h" #include "fd-util.h" #include "macro.h" #include "mountpoint-util.h" @@ -255,8 +256,8 @@ static void test_find_executable_exec_one(const char *path) { pid = fork(); assert_se(pid >= 0); if (pid == 0) { - fexecve(fd, STRV_MAKE(t, "--version"), STRV_MAKE(NULL)); - log_error_errno(errno, "fexecve: %m"); + r = fexecve_or_execve(fd, t, STRV_MAKE(t, "--version"), STRV_MAKE(NULL)); + log_error_errno(r, "[f]execve: %m"); _exit(EXIT_FAILURE); } @@ -268,6 +269,10 @@ static void test_find_executable_exec(void) { test_find_executable_exec_one("touch"); test_find_executable_exec_one("/bin/touch"); + + _cleanup_free_ char *script = NULL; + assert_se(get_testdata_dir("test-path-util/script.sh", &script) >= 0); + test_find_executable_exec_one(script); } static void test_prefixes(void) { diff --git a/test/meson.build b/test/meson.build index 5656abdf72..d9ff25f829 100644 --- a/test/meson.build +++ b/test/meson.build @@ -11,6 +11,8 @@ if install_tests install_dir : testdata_dir) install_subdir('test-path', install_dir : testdata_dir) + install_subdir('test-path-util', + install_dir : testdata_dir) install_subdir('test-umount', install_dir : testdata_dir) install_subdir('test-network-generator-conversion', diff --git a/test/test-path-util/script.sh b/test/test-path-util/script.sh new file mode 100755 index 0000000000..57c93e7476 --- /dev/null +++ b/test/test-path-util/script.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +echo "$0 $@" +test "$(basename $0)" = "script.sh" || exit 1 +test "$1" = "--version" || exit 2 +echo "Life is good" From 8939eeae528ef9b9ad2a21995279b76d382d5c81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Wed, 23 Sep 2020 16:23:30 +0200 Subject: [PATCH 5/8] shared/exec-util: use our own execveat() wrapper instead of fexecve() For scripts, when we call fexecve(), on new kernels glibc calls execveat(), which fails with ENOENT, and then we fall back to execve() which succeeds: [pid 63039] execveat(3, "", ["/home/zbyszek/src/systemd/test/test-path-util/script.sh", "--version"], 0x7ffefa3633f0 /* 0 vars */, AT_EMPTY_PATH) = -1 ENOENT (No such file or directory) [pid 63039] execve("/home/zbyszek/src/systemd/test/test-path-util/script.sh", ["/home/zbyszek/src/systemd/test/test-path-util/script.sh", "--version"], 0x7ffefa3633f0 /* 0 vars */) = 0 But on older kernels glibc (some versions?) implement a fallback which falls into the same trap with bash $0: [pid 13534] execve("/proc/self/fd/3", ["/home/test/systemd/test/test-path-util/script.sh", "--version"], 0x7fff84995870 /* 0 vars */) = 0 We don't want that, so let's call execveat() ourselves. Then we can do the execve() fallback as we want. --- meson.build | 1 + src/basic/missing_syscall.h | 19 +++++++++++++++++++ src/shared/exec-util.c | 7 ++++--- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 04cb63d921..68ead1f849 100644 --- a/meson.build +++ b/meson.build @@ -533,6 +533,7 @@ foreach ident : [ #include #include '''], ['mallinfo', '''#include '''], + ['execveat', '''#include '''], ['close_range', '''#include '''], ] diff --git a/src/basic/missing_syscall.h b/src/basic/missing_syscall.h index 01fec6f2f5..43d08bada7 100644 --- a/src/basic/missing_syscall.h +++ b/src/basic/missing_syscall.h @@ -735,6 +735,25 @@ static inline int missing_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *info) # define rt_sigqueueinfo missing_rt_sigqueueinfo #endif +/* ======================================================================= */ + +#if !HAVE_EXECVEAT +static inline int missing_execveat(int dirfd, const char *pathname, + char *const argv[], char *const envp[], + int flags) { +# if defined __NR_execveat && __NR_execveat >= 0 + return syscall(__NR_execveat, dirfd, pathname, argv, envp, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# undef AT_EMPTY_PATH +# define AT_EMPTY_PATH 0x1000 +# define execveat missing_execveat +#endif + /* ======================================================================= */ #define systemd_NR_close_range systemd_SC_arch_bias(436) diff --git a/src/shared/exec-util.c b/src/shared/exec-util.c index a0bb1567ce..d4ebeea301 100644 --- a/src/shared/exec-util.c +++ b/src/shared/exec-util.c @@ -16,6 +16,7 @@ #include "fileio.h" #include "hashmap.h" #include "macro.h" +#include "missing_syscall.h" #include "process-util.h" #include "rlimit-util.h" #include "serialize.h" @@ -445,9 +446,9 @@ ExecCommandFlags exec_command_flags_from_string(const char *s) { } int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]) { - fexecve(executable_fd, argv, envp); - if (errno == ENOENT) - /* A script? Let's fall back to execve(). + execveat(executable_fd, "", argv, envp, AT_EMPTY_PATH); + if (IN_SET(errno, ENOSYS, ENOENT)) + /* Old kernel or a script? Let's fall back to execve(). * * fexecve(3): "If fd refers to a script (i.e., it is an executable text file that names a * script interpreter with a first line that begins with the characters #!) and the From f98ca3a11d969736e9b5c3b34c90eca722a1a6d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Mon, 5 Oct 2020 21:12:58 +0200 Subject: [PATCH 6/8] test-execute: make sure shell execs the child echo is a built-in, so we were testing execve in our own code, and not in the running child. --- test/test-execute/exec-systemcallfilter-failing.service | 2 +- test/test-execute/exec-systemcallfilter-failing2.service | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test-execute/exec-systemcallfilter-failing.service b/test/test-execute/exec-systemcallfilter-failing.service index 996f859217..c4b0799031 100644 --- a/test/test-execute/exec-systemcallfilter-failing.service +++ b/test/test-execute/exec-systemcallfilter-failing.service @@ -2,7 +2,7 @@ Description=Test for SystemCallFilter [Service] -ExecStart=/bin/sh -c 'echo "This should not be seen"' +ExecStart=/bin/sh -c '/bin/echo "This should not be seen"' Type=oneshot LimitCORE=0 SystemCallFilter=ioperm diff --git a/test/test-execute/exec-systemcallfilter-failing2.service b/test/test-execute/exec-systemcallfilter-failing2.service index c74f42248b..452aaf8273 100644 --- a/test/test-execute/exec-systemcallfilter-failing2.service +++ b/test/test-execute/exec-systemcallfilter-failing2.service @@ -2,7 +2,7 @@ Description=Test for SystemCallFilter [Service] -ExecStart=/bin/sh -c 'echo "This should not be seen"' +ExecStart=/bin/sh -c '/bin/echo "This should not be seen"' Type=oneshot LimitCORE=0 -SystemCallFilter=~write open execve exit_group close mmap munmap fstat DONOTEXIST +SystemCallFilter=~write open execve fexecve execveat exit_group close mmap munmap fstat DONOTEXIST From 3f51bbff559428e4c32640c0c1b7b8516dc16acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Wed, 14 Oct 2020 17:24:24 +0200 Subject: [PATCH 7/8] shared/exec-util: fall back to execve() also on permission errors --- src/shared/exec-util.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/shared/exec-util.c b/src/shared/exec-util.c index d4ebeea301..e6f0af7dc2 100644 --- a/src/shared/exec-util.c +++ b/src/shared/exec-util.c @@ -11,6 +11,7 @@ #include "conf-files.h" #include "env-file.h" #include "env-util.h" +#include "errno-util.h" #include "exec-util.h" #include "fd-util.h" #include "fileio.h" @@ -447,8 +448,9 @@ ExecCommandFlags exec_command_flags_from_string(const char *s) { int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]) { execveat(executable_fd, "", argv, envp, AT_EMPTY_PATH); - if (IN_SET(errno, ENOSYS, ENOENT)) - /* Old kernel or a script? Let's fall back to execve(). + + if (IN_SET(errno, ENOSYS, ENOENT) || ERRNO_IS_PRIVILEGE(errno)) + /* Old kernel or a script or an overzealous seccomp filter? Let's fall back to execve(). * * fexecve(3): "If fd refers to a script (i.e., it is an executable text file that names a * script interpreter with a first line that begins with the characters #!) and the From ceedbf8185fc7593366679f02d31da63af8c4bd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Fri, 6 Nov 2020 15:01:13 +0100 Subject: [PATCH 8/8] meson: add option for fexecve use There are downsides to using fexecve: when fexecve is used (for normal executables), /proc/pid/status shows Name: 3, which means that ps -C foobar doesn't work. pidof works, because it checks /proc/self/cmdline. /proc/self/exe also shows the correct link, but requires privileges to read. /proc/self/comm also shows "3". I think this can be considered a kernel deficiency: when O_CLOEXEC is used, this "3" is completely meaningless. It could be any number. The kernel should use argv[0] instead, which at least has *some* meaning. I think the approach with fexecve/execveat is instersting, so let's provide it as opt-in. --- meson.build | 2 ++ meson_options.txt | 4 +++- src/shared/exec-util.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/meson.build b/meson.build index 68ead1f849..a800959ba9 100644 --- a/meson.build +++ b/meson.build @@ -215,6 +215,7 @@ conf.set_quoted('SYSTEM_SYSVRCND_PATH', sysvrcnd_path) conf.set_quoted('RC_LOCAL_PATH', get_option('rc-local')) conf.set('ANSI_OK_COLOR', 'ANSI_' + get_option('ok-color').underscorify().to_upper()) +conf.set10('ENABLE_FEXECVE', get_option('fexecve')) conf.set_quoted('USER_CONFIG_UNIT_DIR', join_paths(pkgsysconfdir, 'user')) conf.set_quoted('USER_DATA_UNIT_DIR', userunitdir) @@ -3787,6 +3788,7 @@ foreach tuple : [ ['link-timesyncd-shared', get_option('link-timesyncd-shared')], ['kernel-install', get_option('kernel-install')], ['systemd-analyze', conf.get('ENABLE_ANALYZE') == 1], + ['fexecve'], ] if tuple.length() >= 2 diff --git a/meson_options.txt b/meson_options.txt index d5ce647ae6..9d14eca7f9 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -371,13 +371,15 @@ option('fuzz-tests', type : 'boolean', value : 'false', option('install-tests', type : 'boolean', value : 'false', description : 'install test executables') -option('ok-color', type: 'combo', +option('ok-color', type : 'combo', choices : ['black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'highlight-black', 'highlight-red', 'highlight-green', 'highlight-yellow', 'highlight-blue', 'highlight-magenta', 'highlight-cyan', 'highlight-white'], value : 'green', description: 'color of the "OK" status message') +option('fexecve', type : 'boolean', value : 'false', + description : 'use fexecve() to spawn children') option('oss-fuzz', type : 'boolean', value : 'false', description : 'build against oss-fuzz') diff --git a/src/shared/exec-util.c b/src/shared/exec-util.c index e6f0af7dc2..b7303f0aef 100644 --- a/src/shared/exec-util.c +++ b/src/shared/exec-util.c @@ -447,6 +447,7 @@ ExecCommandFlags exec_command_flags_from_string(const char *s) { } int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]) { +#if ENABLE_FEXECVE execveat(executable_fd, "", argv, envp, AT_EMPTY_PATH); if (IN_SET(errno, ENOSYS, ENOENT) || ERRNO_IS_PRIVILEGE(errno)) @@ -463,6 +464,7 @@ int fexecve_or_execve(int executable_fd, const char *executable, char *const arg * least in case of bash) the script name, $0, will be shown as /dev/fd/nnn, which breaks * scripts which make use of $0. Thus, let's fall back to execve() in this case. */ +#endif execve(executable, argv, envp); return -errno; }