Merge pull request #4536 from poettering/seccomp-namespaces
core: add new RestrictNamespaces= unit file setting. Merging, not rebasing, because this touches many files and there were tree-wide cleanups in the meantime.
This commit is contained in:
commit
d85a0f8028
|
@ -1046,7 +1046,9 @@ libshared_la_SOURCES = \
|
|||
src/shared/tests.h \
|
||||
src/shared/tests.c \
|
||||
src/shared/fdset.c \
|
||||
src/shared/fdset.h
|
||||
src/shared/fdset.h \
|
||||
src/shared/nsflags.h \
|
||||
src/shared/nsflags.c
|
||||
|
||||
if HAVE_UTMP
|
||||
libshared_la_SOURCES += \
|
||||
|
|
6
TODO
6
TODO
|
@ -56,14 +56,10 @@ Features:
|
|||
|
||||
* define gpt header bits to select volatility mode
|
||||
|
||||
* nspawn: mount loopback filesystems with "discard"
|
||||
|
||||
* ProtectKernelLogs= (drops CAP_SYSLOG, add seccomp for syslog() syscall, and DeviceAllow to /dev/kmsg) in service files
|
||||
|
||||
* ProtectClock= (drops CAP_SYS_TIME, adds seccomp filters for settimeofday, adjtimex), sets DeviceAllow to /dev/rtc
|
||||
|
||||
* ProtectKernelModules= (drops CAP_SYS_MODULE and filters the kmod syscalls)
|
||||
|
||||
* ProtectTracing= (drops CAP_SYS_PTRACE, blocks ptrace syscall, makes /sys/kernel/tracing go away)
|
||||
|
||||
* ProtectMount= (drop mount/umount/pivot_root from seccomp, disallow fuse via DeviceAllow, imply MountFlags=slave)
|
||||
|
@ -85,8 +81,6 @@ Features:
|
|||
|
||||
* Add RootImage= for mounting a disk image or file as root directory
|
||||
|
||||
* RestrictNamespaces= or so in services (taking away the ability to create namespaces, with setns, unshare, clone)
|
||||
|
||||
* make sure the ratelimit object can deal with USEC_INFINITY as way to turn off things
|
||||
|
||||
* journalctl: make sure -f ends when the container indicated by -M terminates
|
||||
|
|
|
@ -1234,22 +1234,16 @@
|
|||
<varlistentry>
|
||||
<term><varname>NoNewPrivileges=</varname></term>
|
||||
|
||||
<listitem><para>Takes a boolean argument. If true, ensures that the service
|
||||
process and all its children can never gain new privileges through
|
||||
<function>execve</function> (e.g. via setuid or setgid bits, or filesystem
|
||||
capabilities). This is the simplest and most effective way to ensure that
|
||||
a process and its children can never elevate privileges again. Defaults to false,
|
||||
but in the user manager instance certain settings force
|
||||
<varname>NoNewPrivileges=yes</varname>, ignoring the value of this setting.
|
||||
This is the case when <varname>SystemCallFilter=</varname>,
|
||||
<varname>SystemCallArchitectures=</varname>,
|
||||
<varname>RestrictAddressFamilies=</varname>,
|
||||
<varname>PrivateDevices=</varname>,
|
||||
<varname>ProtectKernelTunables=</varname>,
|
||||
<varname>ProtectKernelModules=</varname>,
|
||||
<varname>MemoryDenyWriteExecute=</varname>, or
|
||||
<varname>RestrictRealtime=</varname> are specified.
|
||||
</para></listitem>
|
||||
<listitem><para>Takes a boolean argument. If true, ensures that the service process and all its children can
|
||||
never gain new privileges through <function>execve()</function> (e.g. via setuid or setgid bits, or filesystem
|
||||
capabilities). This is the simplest and most effective way to ensure that a process and its children can never
|
||||
elevate privileges again. Defaults to false, but in the user manager instance certain settings force
|
||||
<varname>NoNewPrivileges=yes</varname>, ignoring the value of this setting. This is the case when
|
||||
<varname>SystemCallFilter=</varname>, <varname>SystemCallArchitectures=</varname>,
|
||||
<varname>RestrictAddressFamilies=</varname>, <varname>RestrictNamespaces=</varname>,
|
||||
<varname>PrivateDevices=</varname>, <varname>ProtectKernelTunables=</varname>,
|
||||
<varname>ProtectKernelModules=</varname>, <varname>MemoryDenyWriteExecute=</varname>, or
|
||||
<varname>RestrictRealtime=</varname> are specified.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
@ -1467,6 +1461,30 @@
|
|||
logging. This does not affect commands prefixed with <literal>+</literal>.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>RestrictNamespaces=</varname></term>
|
||||
|
||||
<listitem><para>Restricts access to Linux namespace functionality for the processes of this unit. For details
|
||||
about Linux namespaces, see
|
||||
<citerefentry><refentrytitle>namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>. Either takes a
|
||||
boolean argument, or a space-separated list of namespace type identifiers. If false (the default), no
|
||||
restrictions on namespace creation and switching are made. If true, access to any kind of namespacing is
|
||||
prohibited. Otherwise, a space-separated list of namespace type identifiers must be specified, consisting of
|
||||
any combination of: <constant>cgroup</constant>, <constant>ipc</constant>, <constant>net</constant>,
|
||||
<constant>mnt</constant>, <constant>pid</constant>, <constant>user</constant> and <constant>uts</constant>. Any
|
||||
namespace type listed is made accessible to the unit's processes, access to namespace types not listed is
|
||||
prohibited (whitelisting). By prepending the list with a single tilde character (<literal>~</literal>) the
|
||||
effect may be inverted: only the listed namespace types will be made inaccessible, all unlisted ones are
|
||||
permitted (blacklisting). If the empty string is assigned, the default namespace restrictions are applied,
|
||||
which is equivalent to false. Internally, this setting limits access to the
|
||||
<citerefentry><refentrytitle>unshare</refentrytitle><manvolnum>2</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>clone</refentrytitle><manvolnum>2</manvolnum></citerefentry> and
|
||||
<citerefentry><refentrytitle>setns</refentrytitle><manvolnum>2</manvolnum></citerefentry> system calls, taking
|
||||
the specified flags parameters into account. Note that — if this option is used — in addition to restricting
|
||||
creation and switching of the specified types of namespaces (or all of them, if true) access to the
|
||||
<function>setns()</function> system call with a zero flags parameter is prohibited.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>ProtectKernelModules=</varname></term>
|
||||
|
||||
|
|
|
@ -781,6 +781,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
|
|||
SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
/* Read-only view of ExecContext.restrict_namespaces. The property name must match the unit file setting and the
 * name accepted by the transient property setter ("RestrictNamespaces"), otherwise clients cannot read back what
 * they set. */
SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_VTABLE_END
|
||||
};
|
||||
|
||||
|
@ -1591,7 +1592,27 @@ int bus_exec_context_set_transient_property(
|
|||
}
|
||||
|
||||
return 1;
|
||||
} else if (streq(name, "RestrictNamespaces")) {
|
||||
uint64_t flags;
|
||||
|
||||
r = sd_bus_message_read(message, "t", &flags);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if ((flags & NAMESPACE_FLAGS_ALL) != flags)
|
||||
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown namespace types");
|
||||
|
||||
if (mode != UNIT_CHECK) {
|
||||
_cleanup_free_ char *s = NULL;
|
||||
|
||||
r = namespace_flag_to_string_many(flags, &s);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
c->restrict_namespaces = flags;
|
||||
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, s);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
ri = rlimit_from_string(name);
|
||||
|
|
|
@ -1534,6 +1534,18 @@ static int apply_private_devices(const Unit *u, const ExecContext *c) {
|
|||
return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
|
||||
}
|
||||
|
||||
/* Installs the seccomp filter that implements RestrictNamespaces= for the given execution context.
 *
 * Returns 0 without installing anything if the context carries no namespace restriction, or if
 * skip_seccomp_unavailable() tells us to skip. Otherwise returns the result of seccomp_restrict_namespaces(). */
static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
        assert(c);

        /* The order of the two checks is significant: skip_seccomp_unavailable() is only consulted when a
         * restriction is actually configured. */
        if (!exec_context_restrict_namespaces_set(c) ||
            skip_seccomp_unavailable(u, "RestrictNamespaces="))
                return 0;

        return seccomp_restrict_namespaces(c->restrict_namespaces);
}
|
||||
|
||||
#endif
|
||||
|
||||
static void do_idle_pipe_dance(int idle_pipe[4]) {
|
||||
|
@ -2183,6 +2195,7 @@ static bool context_has_no_new_privileges(const ExecContext *c) {
|
|||
return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
|
||||
c->memory_deny_write_execute ||
|
||||
c->restrict_realtime ||
|
||||
exec_context_restrict_namespaces_set(c) ||
|
||||
c->protect_kernel_tunables ||
|
||||
c->protect_kernel_modules ||
|
||||
c->private_devices ||
|
||||
|
@ -2764,6 +2777,12 @@ static int exec_child(
|
|||
}
|
||||
}
|
||||
|
||||
r = apply_restrict_namespaces(unit, context);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_SECCOMP;
|
||||
return r;
|
||||
}
|
||||
|
||||
if (context->protect_kernel_tunables) {
|
||||
r = apply_protect_sysctl(unit, context);
|
||||
if (r < 0) {
|
||||
|
@ -2947,6 +2966,7 @@ void exec_context_init(ExecContext *c) {
|
|||
c->personality = PERSONALITY_INVALID;
|
||||
c->runtime_directory_mode = 0755;
|
||||
c->capability_bounding_set = CAP_ALL;
|
||||
c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
|
||||
}
|
||||
|
||||
void exec_context_done(ExecContext *c) {
|
||||
|
@ -3244,6 +3264,7 @@ static void strv_fprintf(FILE *f, char **l) {
|
|||
void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
|
||||
char **e, **d;
|
||||
unsigned i;
|
||||
int r;
|
||||
|
||||
assert(c);
|
||||
assert(f);
|
||||
|
@ -3524,6 +3545,15 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
|
|||
fputc('\n', f);
|
||||
}
|
||||
|
||||
if (exec_context_restrict_namespaces_set(c)) {
|
||||
_cleanup_free_ char *s = NULL;
|
||||
|
||||
r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
|
||||
if (r >= 0)
|
||||
fprintf(f, "%sRestrictNamespaces: %s\n",
|
||||
prefix, s);
|
||||
}
|
||||
|
||||
if (c->syscall_errno > 0)
|
||||
fprintf(f,
|
||||
"%sSystemCallErrorNumber: %s\n",
|
||||
|
|
|
@ -35,6 +35,7 @@ typedef struct ExecParameters ExecParameters;
|
|||
#include "list.h"
|
||||
#include "missing.h"
|
||||
#include "namespace.h"
|
||||
#include "nsflags.h"
|
||||
|
||||
typedef enum ExecUtmpMode {
|
||||
EXEC_UTMP_INIT,
|
||||
|
@ -195,6 +196,8 @@ struct ExecContext {
|
|||
|
||||
unsigned long personality;
|
||||
|
||||
unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */
|
||||
|
||||
Set *syscall_filter;
|
||||
Set *syscall_archs;
|
||||
int syscall_errno;
|
||||
|
@ -216,6 +219,12 @@ struct ExecContext {
|
|||
bool no_new_privileges_set:1;
|
||||
};
|
||||
|
||||
static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) {
|
||||
assert(c);
|
||||
|
||||
return (c->restrict_namespaces & NAMESPACE_FLAGS_ALL) != NAMESPACE_FLAGS_ALL;
|
||||
}
|
||||
|
||||
typedef enum ExecFlags {
|
||||
EXEC_CONFIRM_SPAWN = 1U << 0,
|
||||
EXEC_APPLY_PERMISSIONS = 1U << 1,
|
||||
|
|
|
@ -57,12 +57,14 @@ m4_ifdef(`HAVE_SECCOMP',
|
|||
$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs)
|
||||
$1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context)
|
||||
$1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute)
|
||||
$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context.restrict_namespaces)
|
||||
$1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime)
|
||||
$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)',
|
||||
`$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.RestrictNamespaces, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0')
|
||||
$1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit)
|
||||
|
|
|
@ -2919,6 +2919,54 @@ int config_parse_address_families(
|
|||
set_remove(c->address_families, INT_TO_PTR(af));
|
||||
}
|
||||
}
|
||||
|
||||
int config_parse_restrict_namespaces(
|
||||
const char *unit,
|
||||
const char *filename,
|
||||
unsigned line,
|
||||
const char *section,
|
||||
unsigned section_line,
|
||||
const char *lvalue,
|
||||
int ltype,
|
||||
const char *rvalue,
|
||||
void *data,
|
||||
void *userdata) {
|
||||
|
||||
ExecContext *c = data;
|
||||
bool invert = false;
|
||||
int r;
|
||||
|
||||
if (isempty(rvalue)) {
|
||||
/* Reset to the default. */
|
||||
c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (rvalue[0] == '~') {
|
||||
invert = true;
|
||||
rvalue++;
|
||||
}
|
||||
|
||||
r = parse_boolean(rvalue);
|
||||
if (r > 0)
|
||||
c->restrict_namespaces = 0;
|
||||
else if (r == 0)
|
||||
c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
|
||||
else {
|
||||
/* Not a boolean argument, in this case it's a list of namespace types. */
|
||||
|
||||
r = namespace_flag_from_string_many(rvalue, &c->restrict_namespaces);
|
||||
if (r < 0) {
|
||||
log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (invert)
|
||||
c->restrict_namespaces = (~c->restrict_namespaces) & NAMESPACE_FLAGS_ALL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
int config_parse_unit_slice(
|
||||
|
@ -4342,6 +4390,7 @@ void unit_dump_config_items(FILE *f) {
|
|||
{ config_parse_syscall_archs, "ARCHS" },
|
||||
{ config_parse_syscall_errno, "ERRNO" },
|
||||
{ config_parse_address_families, "FAMILIES" },
|
||||
{ config_parse_restrict_namespaces, "NAMESPACES" },
|
||||
#endif
|
||||
{ config_parse_cpu_shares, "SHARES" },
|
||||
{ config_parse_cpu_weight, "WEIGHT" },
|
||||
|
|
|
@ -116,6 +116,7 @@ int config_parse_fdname(const char *unit, const char *filename, unsigned line, c
|
|||
int config_parse_sec_fix_0(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
|
||||
int config_parse_user_group(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
|
||||
int config_parse_user_group_strv(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
|
||||
int config_parse_restrict_namespaces(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
|
||||
|
||||
/* gperf prototypes */
|
||||
const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, unsigned length);
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
#include "hashmap.h"
|
||||
#include "list.h"
|
||||
#include "locale-util.h"
|
||||
#include "nsflags.h"
|
||||
#include "parse-util.h"
|
||||
#include "path-util.h"
|
||||
#include "process-util.h"
|
||||
|
@ -553,6 +554,30 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
|
|||
|
||||
r = sd_bus_message_close_container(m);
|
||||
|
||||
} else if (streq(field, "RestrictNamespaces")) {
|
||||
bool invert = false;
|
||||
uint64_t flags = 0;
|
||||
|
||||
if (eq[0] == '~') {
|
||||
invert = true;
|
||||
eq++;
|
||||
}
|
||||
|
||||
r = parse_boolean(eq);
|
||||
if (r > 0)
|
||||
flags = 0;
|
||||
else if (r == 0)
|
||||
flags = NAMESPACE_FLAGS_ALL;
|
||||
else {
|
||||
r = namespace_flag_from_string_many(eq, &flags);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to parse %s value %s.", field, eq);
|
||||
}
|
||||
|
||||
if (invert)
|
||||
flags = (~flags) & NAMESPACE_FLAGS_ALL;
|
||||
|
||||
r = sd_bus_message_append(m, "v", "t", flags);
|
||||
} else {
|
||||
log_error("Unknown assignment %s.", assignment);
|
||||
return -EINVAL;
|
||||
|
|
|
@ -0,0 +1,126 @@
|
|||
/***
|
||||
This file is part of systemd.
|
||||
|
||||
Copyright 2016 Lennart Poettering
|
||||
|
||||
systemd is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation; either version 2.1 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
systemd is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with systemd; If not, see <http://www.gnu.org/licenses/>.
|
||||
***/
|
||||
|
||||
#include <sched.h>
|
||||
|
||||
#include "alloc-util.h"
|
||||
#include "extract-word.h"
|
||||
#include "nsflags.h"
|
||||
#include "seccomp-util.h"
|
||||
#include "string-util.h"
|
||||
|
||||
const struct namespace_flag_map namespace_flag_map[] = {
|
||||
{ CLONE_NEWCGROUP, "cgroup" },
|
||||
{ CLONE_NEWIPC, "ipc" },
|
||||
{ CLONE_NEWNET, "net" },
|
||||
/* So, the mount namespace flag is called CLONE_NEWNS for historical reasons. Let's expose it here under a more
|
||||
* explanatory name: "mnt". This is in-line with how the kernel exposes namespaces in /proc/$PID/ns. */
|
||||
{ CLONE_NEWNS, "mnt" },
|
||||
{ CLONE_NEWPID, "pid" },
|
||||
{ CLONE_NEWUSER, "user" },
|
||||
{ CLONE_NEWUTS, "uts" },
|
||||
{}
|
||||
};
|
||||
|
||||
const char* namespace_flag_to_string(unsigned long flag) {
|
||||
unsigned i;
|
||||
|
||||
flag &= NAMESPACE_FLAGS_ALL;
|
||||
|
||||
for (i = 0; namespace_flag_map[i].name; i++)
|
||||
if (flag == namespace_flag_map[i].flag)
|
||||
return namespace_flag_map[i].name;
|
||||
|
||||
return NULL; /* either unknown namespace flag, or a combination of many. This call supports neither. */
|
||||
}
|
||||
|
||||
unsigned long namespace_flag_from_string(const char *name) {
|
||||
unsigned i;
|
||||
|
||||
if (isempty(name))
|
||||
return 0;
|
||||
|
||||
for (i = 0; namespace_flag_map[i].name; i++)
|
||||
if (streq(name, namespace_flag_map[i].name))
|
||||
return namespace_flag_map[i].flag;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int namespace_flag_from_string_many(const char *name, unsigned long *ret) {
|
||||
unsigned long flags = 0;
|
||||
int r;
|
||||
|
||||
assert_se(ret);
|
||||
|
||||
if (!name) {
|
||||
*ret = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
_cleanup_free_ char *word = NULL;
|
||||
unsigned long f;
|
||||
|
||||
r = extract_first_word(&name, &word, NULL, 0);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0)
|
||||
break;
|
||||
|
||||
f = namespace_flag_from_string(word);
|
||||
if (f == 0)
|
||||
return -EINVAL;
|
||||
|
||||
flags |= f;
|
||||
}
|
||||
|
||||
*ret = flags;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int namespace_flag_to_string_many(unsigned long flags, char **ret) {
|
||||
_cleanup_free_ char *s = NULL;
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; namespace_flag_map[i].name; i++) {
|
||||
if ((flags & namespace_flag_map[i].flag) != namespace_flag_map[i].flag)
|
||||
continue;
|
||||
|
||||
if (!s) {
|
||||
s = strdup(namespace_flag_map[i].name);
|
||||
if (!s)
|
||||
return -ENOMEM;
|
||||
} else {
|
||||
if (!strextend(&s, " ", namespace_flag_map[i].name, NULL))
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
|
||||
if (!s) {
|
||||
s = strdup("");
|
||||
if (!s)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
*ret = s;
|
||||
s = NULL;
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
#pragma once
|
||||
|
||||
/***
|
||||
This file is part of systemd.
|
||||
|
||||
Copyright 2016 Lennart Poettering
|
||||
|
||||
systemd is free software; you can redistribute it and/or modify it
|
||||
under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation; either version 2.1 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
systemd is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with systemd; If not, see <http://www.gnu.org/licenses/>.
|
||||
***/
|
||||
|
||||
#include <sched.h>
|
||||
|
||||
#include "missing.h"
|
||||
|
||||
/* The combination of all namespace flags defined by the kernel. The right type for this isn't clear. setns() and
|
||||
* unshare() expect these flags to be passed as (signed) "int", while clone() wants them as "unsigned long". The latter
|
||||
* is definitely more appropriate for a flags parameter, and also the larger type of the two, hence let's stick to that
|
||||
* here. */
|
||||
/* Mask of every namespace flag handled here, as "unsigned long". */
#define NAMESPACE_FLAGS_ALL                                                     \
        ((unsigned long) (CLONE_NEWCGROUP | CLONE_NEWIPC | CLONE_NEWNET |       \
                          CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWUSER |          \
                          CLONE_NEWUTS))
|
||||
|
||||
/* One entry of the namespace flag table: a CLONE_NEWxyz flag and its short textual name. */
struct namespace_flag_map {
        unsigned long flag;
        const char *name;
};

/* The table of known namespace flags; it is terminated by an entry whose name is NULL. */
extern const struct namespace_flag_map namespace_flag_map[];

/* Conversion of a single flag to its name and back. */
const char* namespace_flag_to_string(unsigned long flag);
unsigned long namespace_flag_from_string(const char *name);

/* Conversion between a flag mask and a list of names; both return 0 on success, negative errno on failure. */
int namespace_flag_from_string_many(const char *name, unsigned long *ret);
int namespace_flag_to_string_many(unsigned long flags, char **ret);
|
|
@ -23,7 +23,9 @@
|
|||
#include <sys/prctl.h>
|
||||
#include <linux/seccomp.h>
|
||||
|
||||
#include "alloc-util.h"
|
||||
#include "macro.h"
|
||||
#include "nsflags.h"
|
||||
#include "seccomp-util.h"
|
||||
#include "string-util.h"
|
||||
#include "util.h"
|
||||
|
@ -576,5 +578,92 @@ int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set
|
|||
finish:
|
||||
seccomp_release(seccomp);
|
||||
return r;
|
||||
|
||||
}
|
||||
|
||||
int seccomp_restrict_namespaces(unsigned long retain) {
|
||||
scmp_filter_ctx seccomp;
|
||||
unsigned i;
|
||||
int r;
|
||||
|
||||
if (log_get_max_level() >= LOG_DEBUG) {
|
||||
_cleanup_free_ char *s = NULL;
|
||||
|
||||
(void) namespace_flag_to_string_many(retain, &s);
|
||||
log_debug("Restricting namespace to: %s.", strna(s));
|
||||
}
|
||||
|
||||
/* NOOP? */
|
||||
if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
|
||||
return 0;
|
||||
|
||||
r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
if ((retain & NAMESPACE_FLAGS_ALL) == 0)
|
||||
/* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
|
||||
* altogether. */
|
||||
r = seccomp_rule_add(
|
||||
seccomp,
|
||||
SCMP_ACT_ERRNO(EPERM),
|
||||
SCMP_SYS(setns),
|
||||
0);
|
||||
else
|
||||
/* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
|
||||
* special invocation with a zero flags argument, right here. */
|
||||
r = seccomp_rule_add(
|
||||
seccomp,
|
||||
SCMP_ACT_ERRNO(EPERM),
|
||||
SCMP_SYS(setns),
|
||||
1,
|
||||
SCMP_A1(SCMP_CMP_EQ, 0));
|
||||
if (r < 0)
|
||||
goto finish;
|
||||
|
||||
for (i = 0; namespace_flag_map[i].name; i++) {
|
||||
unsigned long f;
|
||||
|
||||
f = namespace_flag_map[i].flag;
|
||||
if ((retain & f) == f) {
|
||||
log_debug("Permitting %s.", namespace_flag_map[i].name);
|
||||
continue;
|
||||
}
|
||||
|
||||
log_debug("Blocking %s.", namespace_flag_map[i].name);
|
||||
|
||||
r = seccomp_rule_add(
|
||||
seccomp,
|
||||
SCMP_ACT_ERRNO(EPERM),
|
||||
SCMP_SYS(unshare),
|
||||
1,
|
||||
SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
|
||||
if (r < 0)
|
||||
goto finish;
|
||||
|
||||
r = seccomp_rule_add(
|
||||
seccomp,
|
||||
SCMP_ACT_ERRNO(EPERM),
|
||||
SCMP_SYS(clone),
|
||||
1,
|
||||
SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
|
||||
if (r < 0)
|
||||
goto finish;
|
||||
|
||||
if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
|
||||
r = seccomp_rule_add(
|
||||
seccomp,
|
||||
SCMP_ACT_ERRNO(EPERM),
|
||||
SCMP_SYS(setns),
|
||||
1,
|
||||
SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
|
||||
if (r < 0)
|
||||
goto finish;
|
||||
}
|
||||
}
|
||||
|
||||
r = seccomp_load(seccomp);
|
||||
|
||||
finish:
|
||||
seccomp_release(seccomp);
|
||||
return r;
|
||||
}
|
||||
|
|
|
@ -66,3 +66,5 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name);
|
|||
int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
|
||||
|
||||
int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
|
||||
|
||||
int seccomp_restrict_namespaces(unsigned long retain);
|
||||
|
|
|
@ -20,10 +20,15 @@
|
|||
#include <stdlib.h>
|
||||
#include <sys/eventfd.h>
|
||||
#include <unistd.h>
|
||||
#include <sched.h>
|
||||
|
||||
#include "alloc-util.h"
|
||||
#include "fd-util.h"
|
||||
#include "macro.h"
|
||||
#include "missing.h"
|
||||
#include "nsflags.h"
|
||||
#include "process-util.h"
|
||||
#include "raw-clone.h"
|
||||
#include "seccomp-util.h"
|
||||
#include "string-util.h"
|
||||
#include "util.h"
|
||||
|
@ -125,12 +130,101 @@ static void test_filter_sets(void) {
|
|||
}
|
||||
}
|
||||
|
||||
static void test_restrict_namespace(void) {
|
||||
_cleanup_free_ char *s = NULL;
|
||||
pid_t pid;
|
||||
unsigned long ul;
|
||||
|
||||
assert_se(namespace_flag_to_string(0) == NULL);
|
||||
assert_se(streq(namespace_flag_to_string(CLONE_NEWNS), "mnt"));
|
||||
assert_se(namespace_flag_to_string(CLONE_NEWNS|CLONE_NEWIPC) == NULL);
|
||||
assert_se(streq(namespace_flag_to_string(CLONE_NEWCGROUP), "cgroup"));
|
||||
|
||||
assert_se(namespace_flag_from_string("mnt") == CLONE_NEWNS);
|
||||
assert_se(namespace_flag_from_string(NULL) == 0);
|
||||
assert_se(namespace_flag_from_string("") == 0);
|
||||
assert_se(namespace_flag_from_string("uts") == CLONE_NEWUTS);
|
||||
assert_se(namespace_flag_from_string(namespace_flag_to_string(CLONE_NEWUTS)) == CLONE_NEWUTS);
|
||||
assert_se(streq(namespace_flag_to_string(namespace_flag_from_string("ipc")), "ipc"));
|
||||
|
||||
assert_se(namespace_flag_from_string_many(NULL, &ul) == 0 && ul == 0);
|
||||
assert_se(namespace_flag_from_string_many("", &ul) == 0 && ul == 0);
|
||||
assert_se(namespace_flag_from_string_many("mnt uts ipc", &ul) == 0 && ul == (CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC));
|
||||
|
||||
assert_se(namespace_flag_to_string_many(NAMESPACE_FLAGS_ALL, &s) == 0);
|
||||
assert_se(streq(s, "cgroup ipc net mnt pid user uts"));
|
||||
assert_se(namespace_flag_from_string_many(s, &ul) == 0 && ul == NAMESPACE_FLAGS_ALL);
|
||||
|
||||
if (!is_seccomp_available())
|
||||
return;
|
||||
|
||||
if (geteuid() != 0)
|
||||
return;
|
||||
|
||||
pid = fork();
|
||||
assert_se(pid >= 0);
|
||||
|
||||
if (pid == 0) {
|
||||
|
||||
assert_se(seccomp_restrict_namespaces(CLONE_NEWNS|CLONE_NEWNET) >= 0);
|
||||
|
||||
assert_se(unshare(CLONE_NEWNS) == 0);
|
||||
assert_se(unshare(CLONE_NEWNET) == 0);
|
||||
assert_se(unshare(CLONE_NEWUTS) == -1);
|
||||
assert_se(errno == EPERM);
|
||||
assert_se(unshare(CLONE_NEWIPC) == -1);
|
||||
assert_se(errno == EPERM);
|
||||
assert_se(unshare(CLONE_NEWNET|CLONE_NEWUTS) == -1);
|
||||
assert_se(errno == EPERM);
|
||||
|
||||
/* We use fd 0 (stdin) here, which of course will fail with EINVAL on setns(). Except of course our
|
||||
* seccomp filter worked, and hits first and makes it return EPERM */
|
||||
assert_se(setns(0, CLONE_NEWNS) == -1);
|
||||
assert_se(errno == EINVAL);
|
||||
assert_se(setns(0, CLONE_NEWNET) == -1);
|
||||
assert_se(errno == EINVAL);
|
||||
assert_se(setns(0, CLONE_NEWUTS) == -1);
|
||||
assert_se(errno == EPERM);
|
||||
assert_se(setns(0, CLONE_NEWIPC) == -1);
|
||||
assert_se(errno == EPERM);
|
||||
assert_se(setns(0, CLONE_NEWNET|CLONE_NEWUTS) == -1);
|
||||
assert_se(errno == EPERM);
|
||||
assert_se(setns(0, 0) == -1);
|
||||
assert_se(errno == EPERM);
|
||||
|
||||
pid = raw_clone(CLONE_NEWNS);
|
||||
assert_se(pid >= 0);
|
||||
if (pid == 0)
|
||||
_exit(EXIT_SUCCESS);
|
||||
pid = raw_clone(CLONE_NEWNET);
|
||||
assert_se(pid >= 0);
|
||||
if (pid == 0)
|
||||
_exit(EXIT_SUCCESS);
|
||||
pid = raw_clone(CLONE_NEWUTS);
|
||||
assert_se(pid < 0);
|
||||
assert_se(errno == EPERM);
|
||||
pid = raw_clone(CLONE_NEWIPC);
|
||||
assert_se(pid < 0);
|
||||
assert_se(errno == EPERM);
|
||||
pid = raw_clone(CLONE_NEWNET|CLONE_NEWUTS);
|
||||
assert_se(pid < 0);
|
||||
assert_se(errno == EPERM);
|
||||
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
assert_se(wait_for_terminate_and_warn("nsseccomp", pid, true) == EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
log_set_max_level(LOG_DEBUG);
|
||||
|
||||
test_seccomp_arch_to_string();
|
||||
test_architecture_table();
|
||||
test_syscall_filter_set_find();
|
||||
test_filter_sets();
|
||||
test_restrict_namespace();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue