diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md index 19944d08b8..2c0aea07da 100644 --- a/docs/TRANSIENT-SETTINGS.md +++ b/docs/TRANSIENT-SETTINGS.md @@ -151,6 +151,8 @@ All execution-related settings are available for transient units. ✓ TimerSlackNSec= ✓ NoNewPrivileges= ✓ KeyringMode= +✓ ProtectProc= +✓ ProcSubset= ✓ SystemCallFilter= ✓ SystemCallArchitectures= ✓ SystemCallErrorNumber= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 17d128c1b1..c96c654ff0 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -47,6 +47,8 @@ static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, exec_input, ExecInp static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long); @@ -1016,6 +1018,8 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), 
SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1354,6 +1358,8 @@ static BUS_DEFINE_SET_TRANSIENT_PARSE(utmp_mode, ExecUtmpMode, exec_utmp_mode_fr static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE(preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality); static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check); @@ -1706,6 +1712,12 @@ int bus_exec_context_set_transient_property( if (streq(name, "KeyringMode")) return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error); + if (streq(name, "ProtectProc")) + return bus_set_transient_protect_proc(u, name, &c->protect_proc, message, flags, error); + + if (streq(name, "ProcSubset")) + return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error); + if (streq(name, "RuntimeDirectoryPreserve")) return bus_set_transient_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error); diff --git a/src/core/execute.c b/src/core/execute.c index 
c3a87197f7..d5107288a1 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1948,7 +1948,9 @@ static bool exec_needs_mount_namespace( context->protect_kernel_tunables || context->protect_kernel_modules || context->protect_kernel_logs || - context->protect_control_groups) + context->protect_control_groups || + context->protect_proc != PROTECT_PROC_DEFAULT || + context->proc_subset != PROC_SUBSET_ALL) return true; if (context->root_directory) { @@ -2652,6 +2654,8 @@ static int apply_mount_namespace( .private_mounts = context->private_mounts, .protect_home = context->protect_home, .protect_system = context->protect_system, + .protect_proc = context->protect_proc, + .proc_subset = context->proc_subset, }; } else if (!context->dynamic_user && root_dir) /* @@ -4601,7 +4605,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { "%sRestrictRealtime: %s\n" "%sRestrictSUIDSGID: %s\n" "%sKeyringMode: %s\n" - "%sProtectHostname: %s\n", + "%sProtectHostname: %s\n" + "%sProtectProc: %s\n" + "%sProcSubset: %s\n", prefix, c->umask, prefix, c->working_directory ? c->working_directory : "/", prefix, c->root_directory ? 
c->root_directory : "/", @@ -4623,7 +4629,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { prefix, yes_no(c->restrict_realtime), prefix, yes_no(c->restrict_suid_sgid), prefix, exec_keyring_mode_to_string(c->keyring_mode), - prefix, yes_no(c->protect_hostname)); + prefix, yes_no(c->protect_hostname), + prefix, protect_proc_to_string(c->protect_proc), + prefix, proc_subset_to_string(c->proc_subset)); if (c->root_image) fprintf(f, "%sRootImage: %s\n", prefix, c->root_image); diff --git a/src/core/execute.h b/src/core/execute.h index 631279038d..1ea7e51fd7 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -260,6 +260,9 @@ struct ExecContext { char *log_namespace; + ProtectProc protect_proc; /* hidepid= */ + ProcSubset proc_subset; /* subset= */ + bool private_tmp; bool private_network; bool private_devices; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index a191de62af..7d5000c51f 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -73,6 +73,8 @@ $1.AmbientCapabilities, config_parse_capability_set, 0, $1.TimerSlackNSec, config_parse_nsec, 0, offsetof($1, exec_context.timer_slack_nsec) $1.NoNewPrivileges, config_parse_bool, 0, offsetof($1, exec_context.no_new_privileges) $1.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof($1, exec_context.keyring_mode) +$1.ProtectProc, config_parse_protect_proc, 0, offsetof($1, exec_context.protect_proc) +$1.ProcSubset, config_parse_proc_subset, 0, offsetof($1, exec_context.proc_subset) m4_ifdef(`HAVE_SECCOMP', `$1.SystemCallFilter, config_parse_syscall_filter, 0, offsetof($1, exec_context) $1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 75fed001a2..df93fbb28f 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -118,6 +118,8 @@ 
DEFINE_CONFIG_PARSE(config_parse_exec_secure_bits, secure_bits_from_string, "Fai DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode"); DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy"); DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc, "Failed to parse /proc/ protection mode"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset, "Failed to parse /proc/ subset mode"); DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode"); DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode"); DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access specifier"); diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index cee5717d0f..ae134610b1 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -108,6 +108,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat); CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces); CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths); CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc); +CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset); CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec); CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec); CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields); diff --git a/src/core/namespace.c b/src/core/namespace.c index 2e13b10d9c..1f78d66a34 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -97,7 +97,7 @@ static const MountEntry protect_kernel_tunables_table[] = { { "/proc/latency_stats", READONLY, true }, { 
"/proc/mtrr", READONLY, true }, { "/proc/scsi", READONLY, true }, - { "/proc/sys", READONLY, false }, + { "/proc/sys", READONLY, true }, { "/proc/sysrq-trigger", READONLY, true }, { "/proc/timer_stats", READONLY, true }, { "/sys", READONLY, false }, @@ -863,22 +863,53 @@ static int mount_sysfs(const MountEntry *m) { return 1; } -static int mount_procfs(const MountEntry *m) { - int r; +static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) { + const char *entry_path; assert(m); + assert(ns_info); - (void) mkdir_p_label(mount_entry_path(m), 0755); + entry_path = mount_entry_path(m); - r = path_is_mount_point(mount_entry_path(m), NULL, 0); - if (r < 0) - return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m"); - if (r > 0) /* make this a NOP if /proc is already a mount point */ - return 0; + /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in + * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by + * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything + * mounted on /proc/ first. */ - /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */ - if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0) - return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + (void) mkdir_p_label(entry_path, 0755); + (void) umount_recursive(entry_path, 0); + + if (ns_info->protect_proc != PROTECT_PROC_DEFAULT || + ns_info->proc_subset != PROC_SUBSET_ALL) { + _cleanup_free_ char *opts = NULL; + + /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it + * pretended to be per-instance but actually was per-namespace), hence let's make use of it + * if requested. 
To make sure this logic succeeds only on kernels where hidepid= is + * per-instance, we'll exclusively use the textual value for hidepid=, since support was + * added in the same commit: if it's supported it is thus also per-instance. */ + + opts = strjoin("hidepid=", + ns_info->protect_proc == PROTECT_PROC_DEFAULT ? "off" : + protect_proc_to_string(ns_info->protect_proc), + ns_info->proc_subset == PROC_SUBSET_PID ? ",subset=pid" : ""); + if (!opts) + return -ENOMEM; + + if (mount("proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts) < 0) { + if (errno != EINVAL) + return log_debug_errno(errno, "Failed to mount %s (options=%s): %m", mount_entry_path(m), opts); + + /* If this failed with EINVAL then this likely means the textual hidepid= stuff is + * not supported by the kernel, and thus the per-instance hidepid= neither, which + * means we really don't want to use it, since it would affect our host's /proc + * mount. Hence let's gracefully fall back to a classic, unrestricted version. 
*/ + } else + return 1; + } + + if (mount("proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0) + return log_debug_errno(errno, "Failed to mount %s (no options): %m", mount_entry_path(m)); return 1; } @@ -997,7 +1028,8 @@ static int follow_symlink( static int apply_mount( const char *root_directory, - MountEntry *m) { + MountEntry *m, + const NamespaceInfo *ns_info) { _cleanup_free_ char *inaccessible = NULL; bool rbind = true, make = false; @@ -1005,6 +1037,7 @@ static int apply_mount( int r; assert(m); + assert(ns_info); log_debug("Applying namespace mount on %s", mount_entry_path(m)); @@ -1109,7 +1142,7 @@ static int apply_mount( return mount_sysfs(m); case PROCFS: - return mount_procfs(m); + return mount_procfs(m, ns_info); case MOUNT_IMAGES: return mount_images(m); @@ -1221,7 +1254,9 @@ static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) { return ns_info->mount_apivfs || ns_info->protect_control_groups || - ns_info->protect_kernel_tunables; + ns_info->protect_kernel_tunables || + ns_info->protect_proc != PROTECT_PROC_DEFAULT || + ns_info->proc_subset != PROC_SUBSET_ALL; } static size_t namespace_calculate_mounts( @@ -1717,7 +1752,7 @@ int setup_namespace( break; } - r = apply_mount(root, m); + r = apply_mount(root, m, ns_info); if (r < 0) { if (error_path && mount_entry_path(m)) *error_path = strdup(mount_entry_path(m)); @@ -2237,3 +2272,19 @@ static const char* const namespace_type_table[] = { }; DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType); + +static const char* const protect_proc_table[_PROTECT_PROC_MAX] = { + [PROTECT_PROC_DEFAULT] = "default", + [PROTECT_PROC_NOACCESS] = "noaccess", + [PROTECT_PROC_INVISIBLE] = "invisible", + [PROTECT_PROC_PTRACEABLE] = "ptraceable", +}; + +DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc); + +static const char* const proc_subset_table[_PROC_SUBSET_MAX] = { + [PROC_SUBSET_ALL] = "all", + [PROC_SUBSET_PID] = "pid", +}; + +DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset); 
diff --git a/src/core/namespace.h b/src/core/namespace.h index ec1ab4e2a7..e682eae794 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -47,6 +47,22 @@ typedef enum ProtectSystem { _PROTECT_SYSTEM_INVALID = -1 } ProtectSystem; +typedef enum ProtectProc { + PROTECT_PROC_DEFAULT, + PROTECT_PROC_NOACCESS, /* hidepid=noaccess */ + PROTECT_PROC_INVISIBLE, /* hidepid=invisible */ + PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */ + _PROTECT_PROC_MAX, + _PROTECT_PROC_INVALID = -1, +} ProtectProc; + +typedef enum ProcSubset { + PROC_SUBSET_ALL, + PROC_SUBSET_PID, /* subset=pid */ + _PROC_SUBSET_MAX, + _PROC_SUBSET_INVALID = -1, +} ProcSubset; + struct NamespaceInfo { bool ignore_protect_paths:1; bool private_dev:1; @@ -59,6 +75,8 @@ struct NamespaceInfo { bool protect_hostname:1; ProtectHome protect_home; ProtectSystem protect_system; + ProtectProc protect_proc; + ProcSubset proc_subset; }; struct BindMount { @@ -135,6 +153,12 @@ ProtectHome protect_home_from_string(const char *s) _pure_; const char* protect_system_to_string(ProtectSystem p) _const_; ProtectSystem protect_system_from_string(const char *s) _pure_; +const char* protect_proc_to_string(ProtectProc i) _const_; +ProtectProc protect_proc_from_string(const char *s) _pure_; + +const char* proc_subset_to_string(ProcSubset i) _const_; +ProcSubset proc_subset_from_string(const char *s) _pure_; + void bind_mount_free_many(BindMount *b, size_t n); int bind_mount_add(BindMount **b, size_t *n, const BindMount *item); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 7fd2595c0b..d010d3bf3e 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -855,6 +855,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con "RuntimeDirectoryPreserve", "Personality", "KeyringMode", + "ProtectProc", + "ProcSubset", "NetworkNamespacePath", "LogNamespace")) return bus_append_string(m, field, eq); diff --git a/src/test/test-ns.c b/src/test/test-ns.c 
index d3804b50d7..29f6dc5e1f 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -36,6 +36,8 @@ int main(int argc, char *argv[]) { .protect_control_groups = true, .protect_kernel_tunables = true, .protect_kernel_modules = true, + .protect_proc = PROTECT_PROC_NOACCESS, + .proc_subset = PROC_SUBSET_PID, }; char *root_directory; diff --git a/test/fuzz/fuzz-unit-file/directives.service b/test/fuzz/fuzz-unit-file/directives.service index dbff9ab2cc..224ccffb92 100644 --- a/test/fuzz/fuzz-unit-file/directives.service +++ b/test/fuzz/fuzz-unit-file/directives.service @@ -782,6 +782,8 @@ KEYMAP= KEYMAP_TOGGLE= KeepFree= KeyringMode= +ProtectProc= +ProcSubset= KillExcludeUsers= KillOnlyUsers= KillSignal=