Merge pull request #16681 from poettering/hidepid

core: introduce ProtectProc= unit file setting for exposing procfs' hidepid= mount option
This commit is contained in:
Lennart Poettering 2020-08-25 07:47:05 +02:00 committed by GitHub
commit 6944adbbe0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 286 additions and 60 deletions

View File

@ -151,6 +151,8 @@ All execution-related settings are available for transient units.
✓ TimerSlackNSec=
✓ NoNewPrivileges=
✓ KeyringMode=
✓ ProtectProc=
✓ ProcSubset=
✓ SystemCallFilter=
✓ SystemCallArchitectures=
✓ SystemCallErrorNumber=

View File

@ -267,6 +267,55 @@
<xi:include href="system-only.xml" xpointer="singular"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>ProtectProc=</varname></term>
<listitem><para>Takes one of <literal>noaccess</literal>, <literal>invisible</literal>,
<literal>ptraceable</literal> or <literal>default</literal> (which it defaults to). When set, this
controls the <literal>hidepid=</literal> mount option of the <literal>procfs</literal> instance for
the unit that controls which directories with process metainformation
(<filename>/proc/<replaceable>PID</replaceable></filename>) are visible and accessible: when set to
<literal>noaccess</literal> the ability to access most of other users' process metadata in
<filename>/proc/</filename> is taken away for processes of the service. When set to
<literal>invisible</literal> processes owned by other users are hidden from
<filename>/proc/</filename>. If <literal>ptraceable</literal>, all processes that cannot be
<function>ptrace()</function>'ed by a process are hidden from it. If <literal>default</literal>, no
restrictions on <filename>/proc/</filename> access or visibility are made. For further details see
<ulink url="https://www.kernel.org/doc/html/latest/filesystems/proc.html#mount-options">The /proc
Filesystem</ulink>. It is generally recommended to run most system services with this option set to
<literal>invisible</literal>. This option is implemented via file system namespacing, and thus cannot
be used with services that shall be able to install mount points in the host file system
hierarchy. It also cannot be used for services that need to access metainformation about other users'
processes. This option implies <varname>MountAPIVFS=</varname>.</para>
<para>If the kernel doesn't support per-mount point <option>hidepid=</option> mount options, this
setting remains without effect, and the unit's processes will be able to access and see other processes
as if the option was not used.</para>
<xi:include href="system-only.xml" xpointer="singular"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>ProcSubset=</varname></term>
<listitem><para>Takes one of <literal>all</literal> (the default) or <literal>pid</literal>. If set to
the latter, all files and directories not directly associated with process management and introspection
are made invisible in the <filename>/proc/</filename> file system configured for the unit's
processes. This controls the <literal>subset=</literal> mount option of the <literal>procfs</literal>
instance for the unit. For further details see <ulink
url="https://www.kernel.org/doc/html/latest/filesystems/proc.html#mount-options">The /proc
Filesystem</ulink>. Note that Linux exposes various kernel APIs via <filename>/proc/</filename>,
which are made unavailable with this setting. Since these APIs are used frequently this option is
useful only in a few, specific cases, and is not suitable for most non-trivial programs.</para>
<para>Much like <varname>ProtectProc=</varname> above, this is implemented via file system mount
namespacing, and hence the same restrictions apply: it is only available to system services, it
disables mount propagation to the host mount table, and it implies
<varname>MountAPIVFS=</varname>. Also, like <varname>ProtectProc=</varname> this setting is gracefully
disabled if the used kernel does not support the <literal>subset=</literal> mount option of
<literal>procfs</literal>.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>BindPaths=</varname></term>
<term><varname>BindReadOnlyPaths=</varname></term>

View File

@ -50,6 +50,8 @@ struct security_info {
bool ip_filters_custom_egress;
char *keyring_mode;
char *protect_proc;
char *proc_subset;
bool lock_personality;
bool memory_deny_write_execute;
bool no_new_privileges;
@ -135,6 +137,8 @@ static void security_info_free(struct security_info *i) {
free(i->root_image);
free(i->keyring_mode);
free(i->protect_proc);
free(i->proc_subset);
free(i->notify_access);
free(i->device_policy);
@ -388,6 +392,44 @@ static int assess_keyring_mode(
return 0;
}
/* Scores the unit's ProtectProc= value: "invisible" and "ptraceable" yield badness 0, "noaccess"
 * yields 1, and anything else yields 3. No description string is produced. Always returns 0. */
static int assess_protect_proc(
                const struct security_assessor *a,
                const struct security_info *info,
                const void *data,
                uint64_t *ret_badness,
                char **ret_description) {

        uint64_t badness;

        assert(ret_badness);
        assert(ret_description);

        if (STRPTR_IN_SET(info->protect_proc, "invisible", "ptraceable"))
                badness = 0;
        else if (streq_ptr(info->protect_proc, "noaccess"))
                badness = 1;
        else
                badness = 3;

        *ret_badness = badness;
        *ret_description = NULL;

        return 0;
}
/* Scores the unit's ProcSubset= value: badness 0 if it is "pid", badness 1 for any other value.
 * No description string is produced. Always returns 0. */
static int assess_proc_subset(
                const struct security_assessor *a,
                const struct security_info *info,
                const void *data,
                uint64_t *ret_badness,
                char **ret_description) {

        assert(ret_badness);
        assert(ret_description);

        if (streq_ptr(info->proc_subset, "pid"))
                *ret_badness = 0;
        else
                *ret_badness = 1;

        *ret_description = NULL;

        return 0;
}
static int assess_notify_access(
const struct security_assessor *a,
const struct security_info *info,
@ -1149,6 +1191,24 @@ static const struct security_assessor security_assessor_table[] = {
.range = 1,
.assess = assess_keyring_mode,
},
{
.id = "ProtectProc=",
.url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProtectProc=",
.description_good = "Service has restricted access to process tree (/proc hidepid=)",
.description_bad = "Service has full access to process tree (/proc hidepid=)",
.weight = 1000,
.range = 3,
.assess = assess_protect_proc,
},
{
.id = "ProcSubset=",
.url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ProcSubset=",
.description_good = "Service has no access to non-process /proc files (/proc subset=)",
.description_bad = "Service has full access to non-process /proc files (/proc subset=)",
.weight = 10,
.range = 1,
.assess = assess_proc_subset,
},
{
.id = "NotifyAccess=",
.url = "https://www.freedesktop.org/software/systemd/man/systemd.exec.html#NotifyAccess=",
@ -1908,6 +1968,8 @@ static int acquire_security_info(sd_bus *bus, const char *name, struct security_
{ "IPEgressFilterPath", "as", property_read_ip_filters, 0 },
{ "Id", "s", NULL, offsetof(struct security_info, id) },
{ "KeyringMode", "s", NULL, offsetof(struct security_info, keyring_mode) },
{ "ProtectProc", "s", NULL, offsetof(struct security_info, protect_proc) },
{ "ProcSubset", "s", NULL, offsetof(struct security_info, proc_subset) },
{ "LoadState", "s", NULL, offsetof(struct security_info, load_state) },
{ "LockPersonality", "b", NULL, offsetof(struct security_info, lock_personality) },
{ "MemoryDenyWriteExecute", "b", NULL, offsetof(struct security_info, memory_deny_write_execute) },

View File

@ -47,6 +47,8 @@ static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, exec_input, ExecInp
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
@ -1016,6 +1018,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
@ -1354,6 +1358,8 @@ static BUS_DEFINE_SET_TRANSIENT_PARSE(utmp_mode, ExecUtmpMode, exec_utmp_mode_fr
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
@ -1706,6 +1712,12 @@ int bus_exec_context_set_transient_property(
if (streq(name, "KeyringMode"))
return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error);
if (streq(name, "ProtectProc"))
return bus_set_transient_protect_proc(u, name, &c->protect_proc, message, flags, error);
if (streq(name, "ProcSubset"))
return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
if (streq(name, "RuntimeDirectoryPreserve"))
return bus_set_transient_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);

View File

@ -1948,7 +1948,9 @@ static bool exec_needs_mount_namespace(
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
context->protect_control_groups)
context->protect_control_groups ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL)
return true;
if (context->root_directory) {
@ -2650,6 +2652,10 @@ static int apply_mount_namespace(
.protect_hostname = context->protect_hostname,
.mount_apivfs = context->mount_apivfs,
.private_mounts = context->private_mounts,
.protect_home = context->protect_home,
.protect_system = context->protect_system,
.protect_proc = context->protect_proc,
.proc_subset = context->proc_subset,
};
} else if (!context->dynamic_user && root_dir)
/*
@ -2680,8 +2686,6 @@ static int apply_mount_namespace(
tmp_dir,
var_tmp_dir,
context->log_namespace,
needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
context->mount_flags,
context->root_hash, context->root_hash_size, context->root_hash_path,
context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
@ -4601,7 +4605,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
"%sRestrictRealtime: %s\n"
"%sRestrictSUIDSGID: %s\n"
"%sKeyringMode: %s\n"
"%sProtectHostname: %s\n",
"%sProtectHostname: %s\n"
"%sProtectProc: %s\n"
"%sProcSubset: %s\n",
prefix, c->umask,
prefix, c->working_directory ? c->working_directory : "/",
prefix, c->root_directory ? c->root_directory : "/",
@ -4623,7 +4629,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
prefix, yes_no(c->restrict_realtime),
prefix, yes_no(c->restrict_suid_sgid),
prefix, exec_keyring_mode_to_string(c->keyring_mode),
prefix, yes_no(c->protect_hostname));
prefix, yes_no(c->protect_hostname),
prefix, protect_proc_to_string(c->protect_proc),
prefix, proc_subset_to_string(c->proc_subset));
if (c->root_image)
fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);

View File

@ -260,6 +260,9 @@ struct ExecContext {
char *log_namespace;
ProtectProc protect_proc; /* hidepid= */
ProcSubset proc_subset; /* subset= */
bool private_tmp;
bool private_network;
bool private_devices;

View File

@ -73,6 +73,8 @@ $1.AmbientCapabilities, config_parse_capability_set, 0,
$1.TimerSlackNSec, config_parse_nsec, 0, offsetof($1, exec_context.timer_slack_nsec)
$1.NoNewPrivileges, config_parse_bool, 0, offsetof($1, exec_context.no_new_privileges)
$1.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof($1, exec_context.keyring_mode)
$1.ProtectProc, config_parse_protect_proc, 0, offsetof($1, exec_context.protect_proc)
$1.ProcSubset, config_parse_proc_subset, 0, offsetof($1, exec_context.proc_subset)
m4_ifdef(`HAVE_SECCOMP',
`$1.SystemCallFilter, config_parse_syscall_filter, 0, offsetof($1, exec_context)
$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs)

View File

@ -118,6 +118,8 @@ DEFINE_CONFIG_PARSE(config_parse_exec_secure_bits, secure_bits_from_string, "Fai
DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy");
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc, "Failed to parse /proc/ protection mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset, "Failed to parse /proc/ subset mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access specifier");

View File

@ -108,6 +108,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat);
CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces);
CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);

View File

@ -97,7 +97,7 @@ static const MountEntry protect_kernel_tunables_table[] = {
{ "/proc/latency_stats", READONLY, true },
{ "/proc/mtrr", READONLY, true },
{ "/proc/scsi", READONLY, true },
{ "/proc/sys", READONLY, false },
{ "/proc/sys", READONLY, true },
{ "/proc/sysrq-trigger", READONLY, true },
{ "/proc/timer_stats", READONLY, true },
{ "/sys", READONLY, false },
@ -863,33 +863,66 @@ static int mount_sysfs(const MountEntry *m) {
return 1;
}
static int mount_procfs(const MountEntry *m) {
int r;
static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
const char *entry_path;
assert(m);
assert(ns_info);
(void) mkdir_p_label(mount_entry_path(m), 0755);
entry_path = mount_entry_path(m);
r = path_is_mount_point(mount_entry_path(m), NULL, 0);
if (r < 0)
return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
if (r > 0) /* make this a NOP if /proc is already a mount point */
return 0;
/* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
* one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
* our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
* mounted on /proc/ first. */
/* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
(void) mkdir_p_label(entry_path, 0755);
(void) umount_recursive(entry_path, 0);
if (ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
ns_info->proc_subset != PROC_SUBSET_ALL) {
_cleanup_free_ char *opts = NULL;
/* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
* pretended to be per-instance but actually was per-namespace), hence let's make use of it
* if requested. To make sure this logic succeeds only on kernels where hidepid= is
* per-instance, we'll exclusively use the textual value for hidepid=, since support was
* added in the same commit: if it's supported it is thus also per-instance. */
opts = strjoin("hidepid=",
ns_info->protect_proc == PROTECT_PROC_DEFAULT ? "off" :
protect_proc_to_string(ns_info->protect_proc),
ns_info->proc_subset == PROC_SUBSET_PID ? ",subset=pid" : "");
if (!opts)
return -ENOMEM;
if (mount("proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts) < 0) {
if (errno != EINVAL)
return log_debug_errno(errno, "Failed to mount %s (options=%s): %m", mount_entry_path(m), opts);
/* If this failed with EINVAL then this likely means the textual hidepid= stuff is
* not supported by the kernel, and thus the per-instance hidepid= neither, which
* means we really don't want to use it, since it would affect our host's /proc
* mount. Hence let's gracefully fallback to a classic, unrestricted version. */
} else
return 1;
}
if (mount("proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
return log_debug_errno(errno, "Failed to mount %s (no options): %m", mount_entry_path(m));
return 1;
}
static int mount_tmpfs(const MountEntry *m) {
const char *entry_path, *inner_path;
int r;
const char *entry_path = mount_entry_path(m);
const char *source_path = m->path_const;
assert(m);
entry_path = mount_entry_path(m);
inner_path = m->path_const;
/* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
(void) mkdir_p_label(entry_path, 0755);
@ -898,9 +931,9 @@ static int mount_tmpfs(const MountEntry *m) {
if (mount("tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m)) < 0)
return log_debug_errno(errno, "Failed to mount %s: %m", entry_path);
r = label_fix_container(entry_path, source_path, 0);
r = label_fix_container(entry_path, inner_path, 0);
if (r < 0)
return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, source_path);
return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
return 1;
}
@ -995,7 +1028,8 @@ static int follow_symlink(
static int apply_mount(
const char *root_directory,
MountEntry *m) {
MountEntry *m,
const NamespaceInfo *ns_info) {
_cleanup_free_ char *inaccessible = NULL;
bool rbind = true, make = false;
@ -1003,6 +1037,7 @@ static int apply_mount(
int r;
assert(m);
assert(ns_info);
log_debug("Applying namespace mount on %s", mount_entry_path(m));
@ -1107,7 +1142,7 @@ static int apply_mount(
return mount_sysfs(m);
case PROCFS:
return mount_procfs(m);
return mount_procfs(m, ns_info);
case MOUNT_IMAGES:
return mount_images(m);
@ -1219,7 +1254,9 @@ static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
return ns_info->mount_apivfs ||
ns_info->protect_control_groups ||
ns_info->protect_kernel_tunables;
ns_info->protect_kernel_tunables ||
ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
ns_info->proc_subset != PROC_SUBSET_ALL;
}
static size_t namespace_calculate_mounts(
@ -1233,25 +1270,23 @@ static size_t namespace_calculate_mounts(
size_t n_mount_images,
const char* tmp_dir,
const char* var_tmp_dir,
const char* log_namespace,
ProtectHome protect_home,
ProtectSystem protect_system) {
const char* log_namespace) {
size_t protect_home_cnt;
size_t protect_system_cnt =
(protect_system == PROTECT_SYSTEM_STRICT ?
(ns_info->protect_system == PROTECT_SYSTEM_STRICT ?
ELEMENTSOF(protect_system_strict_table) :
((protect_system == PROTECT_SYSTEM_FULL) ?
((ns_info->protect_system == PROTECT_SYSTEM_FULL) ?
ELEMENTSOF(protect_system_full_table) :
((protect_system == PROTECT_SYSTEM_YES) ?
((ns_info->protect_system == PROTECT_SYSTEM_YES) ?
ELEMENTSOF(protect_system_yes_table) : 0)));
protect_home_cnt =
(protect_home == PROTECT_HOME_YES ?
(ns_info->protect_home == PROTECT_HOME_YES ?
ELEMENTSOF(protect_home_yes_table) :
((protect_home == PROTECT_HOME_READ_ONLY) ?
((ns_info->protect_home == PROTECT_HOME_READ_ONLY) ?
ELEMENTSOF(protect_home_read_only_table) :
((protect_home == PROTECT_HOME_TMPFS) ?
((ns_info->protect_home == PROTECT_HOME_TMPFS) ?
ELEMENTSOF(protect_home_tmpfs_table) : 0)));
return !!tmp_dir + !!var_tmp_dir +
@ -1355,8 +1390,6 @@ int setup_namespace(
const char* tmp_dir,
const char* var_tmp_dir,
const char *log_namespace,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags,
const void *root_hash,
size_t root_hash_size,
@ -1389,10 +1422,10 @@ int setup_namespace(
/* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
if (root_read_only(read_only_paths,
protect_system) &&
ns_info->protect_system) &&
home_read_only(read_only_paths, inaccessible_paths, empty_directories,
bind_mounts, n_bind_mounts, temporary_filesystems, n_temporary_filesystems,
protect_home) &&
ns_info->protect_home) &&
strv_isempty(read_write_paths))
dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
@ -1461,8 +1494,7 @@ int setup_namespace(
n_temporary_filesystems,
n_mount_images,
tmp_dir, var_tmp_dir,
log_namespace,
protect_home, protect_system);
log_namespace);
if (n_mounts > 0) {
m = mounts = new0(MountEntry, n_mounts);
@ -1559,11 +1591,11 @@ int setup_namespace(
};
}
r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
r = append_protect_home(&m, ns_info->protect_home, ns_info->ignore_protect_paths);
if (r < 0)
goto finish;
r = append_protect_system(&m, protect_system, false);
r = append_protect_system(&m, ns_info->protect_system, false);
if (r < 0)
goto finish;
@ -1720,7 +1752,7 @@ int setup_namespace(
break;
}
r = apply_mount(root, m);
r = apply_mount(root, m, ns_info);
if (r < 0) {
if (error_path && mount_entry_path(m))
*error_path = strdup(mount_entry_path(m));
@ -2240,3 +2272,19 @@ static const char* const namespace_type_table[] = {
};
DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
/* String names for the ProtectProc enum values. Besides being the configuration spelling, the name
 * of a non-default value is also what mount_procfs() appends to "hidepid=" when mounting procfs. */
static const char* const protect_proc_table[_PROTECT_PROC_MAX] = {
[PROTECT_PROC_DEFAULT] = "default",
[PROTECT_PROC_NOACCESS] = "noaccess",
[PROTECT_PROC_INVISIBLE] = "invisible",
[PROTECT_PROC_PTRACEABLE] = "ptraceable",
};
/* NOTE(review): presumably expands to protect_proc_to_string()/protect_proc_from_string() — confirm
 * against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
/* String names for the ProcSubset enum values. */
static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
[PROC_SUBSET_ALL] = "all",
[PROC_SUBSET_PID] = "pid",
};
/* NOTE(review): presumably expands to proc_subset_to_string()/proc_subset_from_string() — confirm
 * against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);

View File

@ -47,6 +47,22 @@ typedef enum ProtectSystem {
_PROTECT_SYSTEM_INVALID = -1
} ProtectSystem;
typedef enum ProtectProc {
PROTECT_PROC_DEFAULT,
PROTECT_PROC_NOACCESS, /* hidepid=noaccess */
PROTECT_PROC_INVISIBLE, /* hidepid=invisible */
PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */
_PROTECT_PROC_MAX,
_PROTECT_PROC_INVALID = -1,
} ProtectProc;
typedef enum ProcSubset {
PROC_SUBSET_ALL,
PROC_SUBSET_PID, /* subset=pid */
_PROC_SUBSET_MAX,
_PROC_SUBSET_INVALID = -1,
} ProcSubset;
struct NamespaceInfo {
bool ignore_protect_paths:1;
bool private_dev:1;
@ -57,6 +73,10 @@ struct NamespaceInfo {
bool protect_kernel_logs:1;
bool mount_apivfs:1;
bool protect_hostname:1;
ProtectHome protect_home;
ProtectSystem protect_system;
ProtectProc protect_proc;
ProcSubset proc_subset;
};
struct BindMount {
@ -98,8 +118,6 @@ int setup_namespace(
const char *tmp_dir,
const char *var_tmp_dir,
const char *log_namespace,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags,
const void *root_hash,
size_t root_hash_size,
@ -135,6 +153,12 @@ ProtectHome protect_home_from_string(const char *s) _pure_;
const char* protect_system_to_string(ProtectSystem p) _const_;
ProtectSystem protect_system_from_string(const char *s) _pure_;
const char* protect_proc_to_string(ProtectProc i) _const_;
ProtectProc protect_proc_from_string(const char *s) _pure_;
const char* proc_subset_to_string(ProcSubset i) _const_;
ProcSubset proc_subset_from_string(const char *s) _pure_;
void bind_mount_free_many(BindMount *b, size_t n);
int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);

View File

@ -855,6 +855,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
"RuntimeDirectoryPreserve",
"Personality",
"KeyringMode",
"ProtectProc",
"ProcSubset",
"NetworkNamespacePath",
"LogNamespace"))
return bus_append_string(m, field, eq);

View File

@ -163,8 +163,6 @@ static void test_protect_kernel_logs(void) {
NULL,
NULL,
NULL,
PROTECT_HOME_NO,
PROTECT_SYSTEM_NO,
0,
NULL,
0,

View File

@ -36,6 +36,8 @@ int main(int argc, char *argv[]) {
.protect_control_groups = true,
.protect_kernel_tunables = true,
.protect_kernel_modules = true,
.protect_proc = PROTECT_PROC_NOACCESS,
.proc_subset = PROC_SUBSET_PID,
};
char *root_directory;
@ -76,8 +78,6 @@ int main(int argc, char *argv[]) {
tmp_dir,
var_tmp_dir,
NULL,
PROTECT_HOME_NO,
PROTECT_SYSTEM_NO,
0,
NULL,
0,

View File

@ -782,6 +782,8 @@ KEYMAP=
KEYMAP_TOGGLE=
KeepFree=
KeyringMode=
ProtectProc=
ProcSubset=
KillExcludeUsers=
KillOnlyUsers=
KillSignal=

View File

@ -23,11 +23,12 @@ NoNewPrivileges=yes
PrivateDevices=yes
PrivateNetwork=yes
PrivateTmp=yes
ProtectProc=invisible
ProtectControlGroups=yes
ProtectHome=yes
ProtectKernelLogs=yes
ProtectKernelModules=yes
ProtectKernelTunables=yes
ProtectKernelLogs=yes
ProtectSystem=strict
ReadWritePaths=/etc
RestrictAddressFamilies=AF_UNIX

View File

@ -19,12 +19,13 @@ LockPersonality=yes
MemoryDenyWriteExecute=yes
PrivateDevices=yes
PrivateNetwork=yes
ProtectProc=invisible
ProtectControlGroups=yes
ProtectHome=yes
ProtectHostname=yes
ProtectKernelLogs=yes
ProtectKernelModules=yes
ProtectKernelTunables=yes
ProtectKernelLogs=yes
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
RestrictNamespaces=yes
RestrictRealtime=yes

View File

@ -21,13 +21,14 @@ NoNewPrivileges=yes
PrivateDevices=yes
PrivateNetwork=yes
PrivateTmp=yes
ProtectProc=invisible
ProtectClock=yes
ProtectControlGroups=yes
ProtectHome=yes
ProtectHostname=yes
ProtectKernelLogs=yes
ProtectKernelModules=yes
ProtectKernelTunables=yes
ProtectKernelLogs=yes
ProtectSystem=strict
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
RestrictNamespaces=yes

View File

@ -19,12 +19,13 @@ ExecStart=@rootlibexecdir@/systemd-journal-upload --save-state
LockPersonality=yes
MemoryDenyWriteExecute=yes
PrivateDevices=yes
ProtectProc=invisible
ProtectControlGroups=yes
ProtectHome=yes
ProtectHostname=yes
ProtectKernelLogs=yes
ProtectKernelModules=yes
ProtectKernelTunables=yes
ProtectKernelLogs=yes
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
RestrictNamespaces=yes
RestrictRealtime=yes

View File

@ -23,12 +23,13 @@ NoNewPrivileges=yes
PrivateDevices=yes
PrivateNetwork=yes
PrivateTmp=yes
ProtectProc=invisible
ProtectControlGroups=yes
ProtectHome=yes
ProtectHostname=yes
ProtectKernelLogs=yes
ProtectKernelModules=yes
ProtectKernelTunables=yes
ProtectKernelLogs=yes
ProtectSystem=strict
ReadWritePaths=/etc
RestrictAddressFamilies=AF_UNIX

View File

@ -28,7 +28,6 @@ DeviceAllow=char-drm rw
DeviceAllow=char-input rw
DeviceAllow=char-tty rw
DeviceAllow=char-vcs rw
# Make sure the DeviceAllow= lines above can work correctly when referenceing char-drm
ExecStart=@rootlibexecdir@/systemd-logind
FileDescriptorStoreMax=512
IPAddressDeny=any
@ -36,12 +35,13 @@ LockPersonality=yes
MemoryDenyWriteExecute=yes
NoNewPrivileges=yes
PrivateTmp=yes
ProtectProc=invisible
ProtectClock=yes
ProtectControlGroups=yes
ProtectHome=yes
ProtectHostname=yes
ProtectKernelModules=yes
ProtectKernelLogs=yes
ProtectKernelModules=yes
ProtectSystem=strict
ReadWritePaths=/etc /run
Restart=always

View File

@ -26,13 +26,15 @@ ExecStart=!!@rootlibexecdir@/systemd-networkd
LockPersonality=yes
MemoryDenyWriteExecute=yes
NoNewPrivileges=yes
ProtectProc=invisible
ProtectClock=yes
ProtectControlGroups=yes
ProtectHome=yes
ProtectKernelModules=yes
ProtectKernelLogs=yes
ProtectKernelModules=yes
ProtectSystem=strict
Restart=on-failure
RestartKillSignal=SIGUSR2
RestartSec=0
RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6 AF_PACKET AF_ALG
RestrictNamespaces=yes
@ -44,7 +46,6 @@ SystemCallArchitectures=native
SystemCallErrorNumber=EPERM
SystemCallFilter=@system-service
Type=notify
RestartKillSignal=SIGUSR2
User=systemd-network
@SERVICE_WATCHDOG@

View File

@ -28,12 +28,13 @@ MemoryDenyWriteExecute=yes
NoNewPrivileges=yes
PrivateDevices=yes
PrivateTmp=yes
ProtectProc=invisible
ProtectClock=yes
ProtectControlGroups=yes
ProtectHome=yes
ProtectKernelLogs=yes
ProtectKernelModules=yes
ProtectKernelTunables=yes
ProtectKernelLogs=yes
ProtectSystem=strict
Restart=always
RestartSec=0

View File

@ -22,12 +22,13 @@ LockPersonality=yes
MemoryDenyWriteExecute=yes
NoNewPrivileges=yes
PrivateTmp=yes
ProtectProc=invisible
ProtectControlGroups=yes
ProtectHome=yes
ProtectHostname=yes
ProtectKernelLogs=yes
ProtectKernelModules=yes
ProtectKernelTunables=yes
ProtectKernelLogs=yes
ProtectSystem=strict
ReadWritePaths=/etc
RestrictAddressFamilies=AF_UNIX

View File

@ -27,12 +27,13 @@ MemoryDenyWriteExecute=yes
NoNewPrivileges=yes
PrivateDevices=yes
PrivateTmp=yes
ProtectProc=invisible
ProtectControlGroups=yes
ProtectHome=yes
ProtectHostname=yes
ProtectKernelLogs=yes
ProtectKernelModules=yes
ProtectKernelTunables=yes
ProtectKernelLogs=yes
ProtectSystem=strict
Restart=always
RestartSec=0

View File

@ -24,6 +24,7 @@ LockPersonality=yes
MemoryDenyWriteExecute=yes
NoNewPrivileges=yes
PrivateDevices=yes
ProtectProc=invisible
ProtectControlGroups=yes
ProtectHome=yes
ProtectHostname=yes