diff --git a/meson.build b/meson.build index 07ad9c4d5d..75eb6fc1dc 100644 --- a/meson.build +++ b/meson.build @@ -216,6 +216,7 @@ conf.set_quoted('SYSTEM_SYSVRCND_PATH', sysvrcnd_path) conf.set_quoted('RC_LOCAL_PATH', get_option('rc-local')) conf.set('ANSI_OK_COLOR', 'ANSI_' + get_option('ok-color').underscorify().to_upper()) +conf.set10('ENABLE_FEXECVE', get_option('fexecve')) conf.set_quoted('USER_CONFIG_UNIT_DIR', join_paths(pkgsysconfdir, 'user')) conf.set_quoted('USER_DATA_UNIT_DIR', userunitdir) @@ -534,6 +535,7 @@ foreach ident : [ #include #include '''], ['mallinfo', '''#include '''], + ['execveat', '''#include '''], ['close_range', '''#include '''], ] @@ -3832,6 +3834,7 @@ foreach tuple : [ ['link-timesyncd-shared', get_option('link-timesyncd-shared')], ['kernel-install', get_option('kernel-install')], ['systemd-analyze', conf.get('ENABLE_ANALYZE') == 1], + ['fexecve'], ] if tuple.length() >= 2 diff --git a/meson_options.txt b/meson_options.txt index 2435ccebd4..b50b0e9224 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -375,13 +375,15 @@ option('fuzz-tests', type : 'boolean', value : 'false', option('install-tests', type : 'boolean', value : 'false', description : 'install test executables') -option('ok-color', type: 'combo', +option('ok-color', type : 'combo', choices : ['black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white', 'highlight-black', 'highlight-red', 'highlight-green', 'highlight-yellow', 'highlight-blue', 'highlight-magenta', 'highlight-cyan', 'highlight-white'], value : 'green', description: 'color of the "OK" status message') +option('fexecve', type : 'boolean', value : 'false', + description : 'use fexecve() to spawn children') option('oss-fuzz', type : 'boolean', value : 'false', description : 'build against oss-fuzz') diff --git a/src/basic/missing_syscall.h b/src/basic/missing_syscall.h index 6f2a3589c0..7e6dd0cb52 100644 --- a/src/basic/missing_syscall.h +++ b/src/basic/missing_syscall.h @@ -749,6 +749,25 @@ static inline int missing_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *info) # define rt_sigqueueinfo missing_rt_sigqueueinfo #endif +/* ======================================================================= */ + +#if !HAVE_EXECVEAT +static inline int missing_execveat(int dirfd, const char *pathname, + char *const argv[], char *const envp[], + int flags) { +# if defined __NR_execveat && __NR_execveat >= 0 + return syscall(__NR_execveat, dirfd, pathname, argv, envp, flags); +# else + errno = ENOSYS; + return -1; +# endif +} + +# undef AT_EMPTY_PATH +# define AT_EMPTY_PATH 0x1000 +# define execveat missing_execveat +#endif + /* ======================================================================= */ #define systemd_NR_close_range systemd_SC_arch_bias(436) diff --git a/src/basic/path-util.c b/src/basic/path-util.c index 794599a172..dae8b7341a 100644 --- a/src/basic/path-util.c +++ b/src/basic/path-util.c @@ -585,22 +585,53 @@ char* path_join_internal(const char *first, ...) { return joined; } -int find_executable_full(const char *name, bool use_path_envvar, char **ret) { +static int check_x_access(const char *path, int *ret_fd) { + if (ret_fd) { + _cleanup_close_ int fd = -1; + int r; + + /* We need to use O_PATH because there may be executables for which we have only exec + * permissions, but not read (usually suid executables). */ + fd = open(path, O_PATH|O_CLOEXEC); + if (fd < 0) + return -errno; + + r = access_fd(fd, X_OK); + if (r < 0) + return r; + + *ret_fd = TAKE_FD(fd); + } else { + /* Let's optimize things a bit by not opening the file if we don't need the fd. */ + if (access(path, X_OK) < 0) + return -errno; + } + + return 0; +} + +int find_executable_full(const char *name, bool use_path_envvar, char **ret_filename, int *ret_fd) { int last_error, r; const char *p = NULL; assert(name); if (is_path(name)) { - if (access(name, X_OK) < 0) - return -errno; + _cleanup_close_ int fd = -1; - if (ret) { - r = path_make_absolute_cwd(name, ret); + r = check_x_access(name, ret_fd ? &fd : NULL); + if (r < 0) + return r; + + if (ret_filename) { + r = path_make_absolute_cwd(name, ret_filename); if (r < 0) return r; } + if (ret_fd) + *ret_fd = TAKE_FD(fd); + return 0; } @@ -613,8 +644,10 @@ int find_executable_full(const char *name, bool use_path_envvar, char **ret) { last_error = -ENOENT; + /* Resolve a single-component name to a full path */ for (;;) { _cleanup_free_ char *j = NULL, *element = NULL; + _cleanup_close_ int fd = -1; r = extract_first_word(&p, &element, ":", EXTRACT_RELAX|EXTRACT_DONT_COALESCE_SEPARATORS); if (r < 0) @@ -629,7 +662,8 @@ int find_executable_full(const char *name, bool use_path_envvar, char **ret) { if (!j) return -ENOMEM; - if (access(j, X_OK) >= 0) { + r = check_x_access(j, ret_fd ? &fd : NULL); + if (r >= 0) { _cleanup_free_ char *with_dash; with_dash = strjoin(j, "/"); @@ -643,8 +677,10 @@ int find_executable_full(const char *name, bool use_path_envvar, char **ret) { /* We can't just `continue` inverting this case, since we need to update last_error. */ if (errno == ENOTDIR) { /* Found it! */ - if (ret) - *ret = path_simplify(TAKE_PTR(j), false); + if (ret_filename) + *ret_filename = path_simplify(TAKE_PTR(j), false); + if (ret_fd) + *ret_fd = TAKE_FD(fd); return 0; } diff --git a/src/basic/path-util.h b/src/basic/path-util.h index d613709f0b..e7a26c9a84 100644 --- a/src/basic/path-util.h +++ b/src/basic/path-util.h @@ -88,9 +88,9 @@ int path_strv_make_absolute_cwd(char **l); char** path_strv_resolve(char **l, const char *root); char** path_strv_resolve_uniq(char **l, const char *root); -int find_executable_full(const char *name, bool use_path_envvar, char **ret); -static inline int find_executable(const char *name, char **ret) { - return find_executable_full(name, true, ret); +int find_executable_full(const char *name, bool use_path_envvar, char **ret_filename, int *ret_fd); +static inline int find_executable(const char *name, char **ret_filename) { + return find_executable_full(name, true, ret_filename, NULL); } bool paths_check_timestamp(const char* const* paths, usec_t *paths_ts_usec, bool update); diff --git a/src/core/execute.c b/src/core/execute.c index 56793c5a43..8f901fa715 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1248,8 +1248,8 @@ static int setup_pam( * termination */ barrier_set_role(&barrier, BARRIER_CHILD); - /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds - * are open here that have been opened by PAM. */ + /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only + * those fds are open here that have been opened by PAM. */ (void) close_many(fds, n_fds); /* Drop privileges - we don't need any to pam_close_session @@ -2893,11 +2893,11 @@ static int setup_credentials( #if ENABLE_SMACK static int setup_smack( const ExecContext *context, - const char *executable) { + int executable_fd) { int r; assert(context); - assert(executable); + assert(executable_fd >= 0); if (context->smack_process_label) { r = mac_smack_apply_pid(0, context->smack_process_label); @@ -2908,7 +2908,7 @@ static int setup_smack( else { _cleanup_free_ char *exec_label = NULL; - r = mac_smack_read(executable, SMACK_ATTR_EXEC, &exec_label); + r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label); if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP)) return r; @@ -3419,7 +3419,6 @@ static int close_remaining_fds( const DynamicCreds *dcreds, int user_lookup_fd, int socket_fd, - int exec_fd, const int *fds, size_t n_fds) { size_t n_dont_close = 0; @@ -3436,8 +3435,6 @@ static int close_remaining_fds( if (socket_fd >= 0) dont_close[n_dont_close++] = socket_fd; - if (exec_fd >= 0) - dont_close[n_dont_close++] = exec_fd; if (n_fds > 0) { memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds); n_dont_close += n_fds; @@ -3614,6 +3611,35 @@ bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) { return c->cpu_affinity_from_numa; } +static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) { + int r; + + assert(fds); + assert(n_fds); + assert(*n_fds < fds_size); + assert(ret_fd); + + if (fd < 0) { + *ret_fd = -1; + return 0; + } + + if (fd < 3 + (int) *n_fds) { + /* Let's move the fd up, so that it's outside of the fd range we will use to store + * the fds we pass to the process (or which are closed only during execve). */ + + r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds); + if (r < 0) + return -errno; + + CLOSE_AND_REPLACE(fd, r); + } + + *ret_fd = fds[*n_fds] = fd; + (*n_fds) ++; + return 1; +} + static int exec_child( Unit *unit, const ExecCommand *command, @@ -3631,7 +3657,7 @@ static int exec_child( int *exit_status) { _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL; - int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1; + int r, ngids = 0, exec_fd; _cleanup_free_ gid_t *supplementary_gids = NULL; const char *username = NULL, *groupname = NULL; _cleanup_free_ char *home_buffer = NULL; @@ -3658,7 +3684,8 @@ static int exec_child( gid_t saved_gid = getgid(); uid_t uid = UID_INVALID; gid_t gid = GID_INVALID; - size_t n_fds; + size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */ + n_keep_fds; /* total number of fds not to close */ int secure_bits; _cleanup_free_ gid_t *gids_after_pam = NULL; int ngids_after_pam = 0; @@ -3702,8 +3729,17 @@ static int exec_child( /* In case anything used libc syslog(), close this here, too */ closelog(); - n_fds = n_socket_fds + n_storage_fds; - r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds); + int keep_fds[n_fds + 2]; + memcpy_safe(keep_fds, fds, n_fds * sizeof(int)); + n_keep_fds = n_fds; + + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m"); + } + + r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds); if (r < 0) { *exit_status = EXIT_FDS; return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m"); @@ -4095,6 +4131,7 @@ static int exec_child( /* Let's call into PAM after we set up our own idea of resource limits to that pam_limits * wins here. (See above.) */ + /* All fds passed in the fds array will be closed in the pam child process. */ r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds); if (r < 0) { *exit_status = EXIT_PAM; @@ -4202,7 +4239,8 @@ static int exec_child( * shall execute. */ _cleanup_free_ char *executable = NULL; - r = find_executable_full(command->path, false, &executable); + _cleanup_close_ int executable_fd = -1; + r = find_executable_full(command->path, false, &executable, &executable_fd); if (r < 0) { if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { log_struct_errno(LOG_INFO, r, @@ -4225,6 +4263,12 @@ static int exec_child( "EXECUTABLE=%s", command->path); } + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m"); + } + #if HAVE_SELINUX if (needs_sandboxing && use_selinux && params->selinux_context_net && socket_fd >= 0) { r = mac_selinux_get_child_mls_label(socket_fd, executable, context->selinux_context, &mac_selinux_context_net); @@ -4239,41 +4283,7 @@ static int exec_child( * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd * however if we have it as we want to keep it open until the final execve(). */ - if (params->exec_fd >= 0) { - exec_fd = params->exec_fd; - - if (exec_fd < 3 + (int) n_fds) { - int moved_fd; - - /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the - * process we are about to execute. */ - - moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds); - if (moved_fd < 0) { - *exit_status = EXIT_FDS; - return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m"); - } - - CLOSE_AND_REPLACE(exec_fd, moved_fd); - } else { - /* This fd should be FD_CLOEXEC already, but let's make sure. */ - r = fd_cloexec(exec_fd, true); - if (r < 0) { - *exit_status = EXIT_FDS; - return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m"); - } - } - - fds_with_exec_fd = newa(int, n_fds + 1); - memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int)); - fds_with_exec_fd[n_fds] = exec_fd; - n_fds_with_exec_fd = n_fds + 1; - } else { - fds_with_exec_fd = fds; - n_fds_with_exec_fd = n_fds; - } - - r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd); + r = close_all_fds(keep_fds, n_keep_fds); if (r >= 0) r = shift_fds(fds, n_fds); if (r >= 0) @@ -4307,7 +4317,7 @@ static int exec_child( /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the * process. This is the latest place before dropping capabilities. Other MAC context are set later. */ if (use_smack) { - r = setup_smack(context, executable); + r = setup_smack(context, executable_fd); if (r < 0) { *exit_status = EXIT_SMACK_PROCESS_LABEL; return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m"); @@ -4577,8 +4587,7 @@ static int exec_child( } } - execve(executable, final_argv, accum_env); - r = -errno; + r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env); if (exec_fd >= 0) { uint8_t hot = 0; diff --git a/src/shared/exec-util.c b/src/shared/exec-util.c index 61ee3b18d5..bdea60ca02 100644 --- a/src/shared/exec-util.c +++ b/src/shared/exec-util.c @@ -11,11 +11,13 @@ #include "conf-files.h" #include "env-file.h" #include "env-util.h" +#include "errno-util.h" #include "exec-util.h" #include "fd-util.h" #include "fileio.h" #include "hashmap.h" #include "macro.h" +#include "missing_syscall.h" #include "process-util.h" #include "rlimit-util.h" #include "serialize.h" @@ -33,7 +35,6 @@ assert_cc(EAGAIN == EWOULDBLOCK); static int do_spawn(const char *path, char *argv[], int stdout_fd, pid_t *pid) { - pid_t _pid; int r; @@ -444,3 +445,26 @@ ExecCommandFlags exec_command_flags_from_string(const char *s) { else return 1 << idx; } + +int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]) { +#if ENABLE_FEXECVE + execveat(executable_fd, "", argv, envp, AT_EMPTY_PATH); + + if (IN_SET(errno, ENOSYS, ENOENT) || ERRNO_IS_PRIVILEGE(errno)) + /* Old kernel or a script or an overzealous seccomp filter? Let's fall back to execve(). + * + * fexecve(3): "If fd refers to a script (i.e., it is an executable text file that names a + * script interpreter with a first line that begins with the characters #!) and the + * close-on-exec flag has been set for fd, then fexecve() fails with the error ENOENT. This + * error occurs because, by the time the script interpreter is executed, fd has already been + * closed because of the close-on-exec flag. Thus, the close-on-exec flag can't be set on fd + * if it refers to a script." + * + * Unfortunately, if we unset close-on-exec, the script will be executed just fine, but (at + * least in case of bash) the script name, $0, will be shown as /dev/fd/nnn, which breaks + * scripts which make use of $0. Thus, let's fall back to execve() in this case. + */ +#endif + execve(executable, argv, envp); + return -errno; +} diff --git a/src/shared/exec-util.h b/src/shared/exec-util.h index a69d57c7ae..df6214e4c3 100644 --- a/src/shared/exec-util.h +++ b/src/shared/exec-util.h @@ -45,3 +45,5 @@ extern const gather_stdout_callback_t gather_environment[_STDOUT_CONSUME_MAX]; const char* exec_command_flags_to_string(ExecCommandFlags i); ExecCommandFlags exec_command_flags_from_string(const char *s); + +int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]); diff --git a/src/test/test-path-util.c b/src/test/test-path-util.c index f4f8d0550b..874bab8f94 100644 --- a/src/test/test-path-util.c +++ b/src/test/test-path-util.c @@ -4,10 +4,12 @@ #include #include "alloc-util.h" +#include "exec-util.h" #include "fd-util.h" #include "macro.h" #include "mountpoint-util.h" #include "path-util.h" +#include "process-util.h" #include "rm-rf.h" #include "stat-util.h" #include "string-util.h" @@ -169,12 +171,12 @@ static void test_find_executable_full(void) { log_info("/* %s */", __func__); - assert_se(find_executable_full("sh", true, &p) == 0); + assert_se(find_executable_full("sh", true, &p, NULL) == 0); puts(p); assert_se(streq(basename(p), "sh")); free(p); - assert_se(find_executable_full("sh", false, &p) == 0); + assert_se(find_executable_full("sh", false, &p, NULL) == 0); puts(p); assert_se(streq(basename(p), "sh")); free(p); @@ -186,12 +188,12 @@ static void test_find_executable_full(void) { assert_se(unsetenv("PATH") == 0); - assert_se(find_executable_full("sh", true, &p) == 0); + assert_se(find_executable_full("sh", true, &p, NULL) == 0); puts(p); assert_se(streq(basename(p), "sh")); free(p); - assert_se(find_executable_full("sh", false, &p) == 0); + assert_se(find_executable_full("sh", false, &p, NULL) == 0); puts(p); assert_se(streq(basename(p), "sh")); free(p); @@ -236,6 +238,43 @@ static void test_find_executable(const char *self) { assert_se(find_executable("/proc/filesystems", &p) == -EACCES); } +static void test_find_executable_exec_one(const char *path) { + _cleanup_free_ char *t = NULL; + _cleanup_close_ int fd = -1; + pid_t pid; + int r; + + r = find_executable_full(path, false, &t, &fd); + + log_info_errno(r, "%s: %s → %s: %d/%m", __func__, path, t ?: "-", fd); + + assert_se(fd > STDERR_FILENO); + assert_se(path_is_absolute(t)); + if (path_is_absolute(path)) + assert_se(streq(t, path)); + + pid = fork(); + assert_se(pid >= 0); + if (pid == 0) { + r = fexecve_or_execve(fd, t, STRV_MAKE(t, "--version"), STRV_MAKE(NULL)); + log_error_errno(r, "[f]execve: %m"); + _exit(EXIT_FAILURE); + } + + assert_se(wait_for_terminate_and_check(t, pid, WAIT_LOG) == 0); +} + +static void test_find_executable_exec(void) { + log_info("/* %s */", __func__); + + test_find_executable_exec_one("touch"); + test_find_executable_exec_one("/bin/touch"); + + _cleanup_free_ char *script = NULL; + assert_se(get_testdata_dir("test-path-util/script.sh", &script) >= 0); + test_find_executable_exec_one(script); +} + static void test_prefixes(void) { static const char* const values[] = { "/a/b/c/d", @@ -717,6 +756,7 @@ int main(int argc, char **argv) { test_path_equal_root(); test_find_executable_full(); test_find_executable(argv[0]); + test_find_executable_exec(); test_prefixes(); test_path_join(); test_fsck_exists(); diff --git a/test/meson.build b/test/meson.build index cc461febf1..799d74c7ae 100644 --- a/test/meson.build +++ b/test/meson.build @@ -11,6 +11,8 @@ if install_tests install_dir : testdata_dir) install_subdir('test-path', install_dir : testdata_dir) + install_subdir('test-path-util', + install_dir : testdata_dir) install_subdir('test-umount', install_dir : testdata_dir) install_subdir('test-network-generator-conversion', diff --git a/test/test-execute/exec-systemcallfilter-failing.service b/test/test-execute/exec-systemcallfilter-failing.service index 996f859217..c4b0799031 100644 --- a/test/test-execute/exec-systemcallfilter-failing.service +++ b/test/test-execute/exec-systemcallfilter-failing.service @@ -2,7 +2,7 @@ Description=Test for SystemCallFilter [Service] -ExecStart=/bin/sh -c 'echo "This should not be seen"' +ExecStart=/bin/sh -c '/bin/echo "This should not be seen"' Type=oneshot LimitCORE=0 SystemCallFilter=ioperm diff --git a/test/test-execute/exec-systemcallfilter-failing2.service b/test/test-execute/exec-systemcallfilter-failing2.service index c74f42248b..452aaf8273 100644 --- a/test/test-execute/exec-systemcallfilter-failing2.service +++ b/test/test-execute/exec-systemcallfilter-failing2.service @@ -2,7 +2,7 @@ Description=Test for SystemCallFilter [Service] -ExecStart=/bin/sh -c 'echo "This should not be seen"' +ExecStart=/bin/sh -c '/bin/echo "This should not be seen"' Type=oneshot LimitCORE=0 -SystemCallFilter=~write open execve exit_group close mmap munmap fstat DONOTEXIST +SystemCallFilter=~write open execve fexecve execveat exit_group close mmap munmap fstat DONOTEXIST diff --git a/test/test-path-util/script.sh b/test/test-path-util/script.sh new file mode 100755 index 0000000000..57c93e7476 --- /dev/null +++ b/test/test-path-util/script.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +echo "$0 $@" +test "$(basename $0)" = "script.sh" || exit 1 +test "$1" = "--version" || exit 2 +echo "Life is good"