2017-11-18 17:09:20 +01:00
|
|
|
/* SPDX-License-Identifier: LGPL-2.1+ */
|
2010-02-03 13:03:47 +01:00
|
|
|
|
2015-11-30 21:43:37 +01:00
|
|
|
#include <alloca.h>
|
2009-11-18 00:42:52 +01:00
|
|
|
#include <errno.h>
|
2015-09-19 00:53:58 +02:00
|
|
|
#include <fcntl.h>
|
|
|
|
#include <sched.h>
|
|
|
|
#include <signal.h>
|
|
|
|
#include <stdarg.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
2011-10-07 21:06:39 +02:00
|
|
|
#include <sys/mman.h>
|
2015-09-19 00:53:58 +02:00
|
|
|
#include <sys/prctl.h>
|
2015-11-30 21:43:37 +01:00
|
|
|
#include <sys/statfs.h>
|
|
|
|
#include <sys/sysmacros.h>
|
2015-09-19 00:53:58 +02:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <unistd.h>
|
2015-02-11 18:50:38 +01:00
|
|
|
|
2015-10-27 03:01:06 +01:00
|
|
|
#include "alloc-util.h"
|
2017-09-13 11:47:15 +02:00
|
|
|
#include "btrfs-util.h"
|
2015-09-23 03:01:06 +02:00
|
|
|
#include "build.h"
|
2016-06-08 18:56:20 +02:00
|
|
|
#include "cgroup-util.h"
|
2015-09-19 00:53:58 +02:00
|
|
|
#include "def.h"
|
2017-10-31 09:37:15 +01:00
|
|
|
#include "device-nodes.h"
|
2015-11-16 22:09:36 +01:00
|
|
|
#include "dirent-util.h"
|
2015-10-25 13:14:12 +01:00
|
|
|
#include "fd-util.h"
|
2015-09-19 00:53:58 +02:00
|
|
|
#include "fileio.h"
|
2016-11-07 16:14:59 +01:00
|
|
|
#include "format-util.h"
|
2015-09-19 00:53:58 +02:00
|
|
|
#include "hashmap.h"
|
|
|
|
#include "hostname-util.h"
|
2010-02-12 02:01:14 +01:00
|
|
|
#include "log.h"
|
2015-09-19 00:53:58 +02:00
|
|
|
#include "macro.h"
|
|
|
|
#include "missing.h"
|
2015-10-26 16:18:16 +01:00
|
|
|
#include "parse-util.h"
|
2012-05-07 21:36:12 +02:00
|
|
|
#include "path-util.h"
|
2015-04-10 19:10:00 +02:00
|
|
|
#include "process-util.h"
|
2018-01-17 15:35:01 +01:00
|
|
|
#include "procfs-util.h"
|
2015-11-30 21:43:37 +01:00
|
|
|
#include "set.h"
|
2015-12-01 23:22:03 +01:00
|
|
|
#include "signal-util.h"
|
2015-11-16 22:09:36 +01:00
|
|
|
#include "stat-util.h"
|
2015-10-24 22:58:24 +02:00
|
|
|
#include "string-util.h"
|
2015-09-19 00:53:58 +02:00
|
|
|
#include "strv.h"
|
2015-12-01 23:22:03 +01:00
|
|
|
#include "time-util.h"
|
2016-04-07 16:15:26 +02:00
|
|
|
#include "umask-util.h"
|
2015-10-25 22:32:30 +01:00
|
|
|
#include "user-util.h"
|
2015-10-23 18:52:53 +02:00
|
|
|
#include "util.h"
|
2018-01-10 10:36:14 +01:00
|
|
|
#include "virt.h"
|
Systemd is causing mislabeled devices to be created and then attempting to read them.
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
On 07/28/2010 05:57 AM, Kay Sievers wrote:
> On Wed, Jul 28, 2010 at 11:43, Lennart Poettering
> <lennart@poettering.net> wrote:
>> On Mon, 26.07.10 16:42, Daniel J Walsh (dwalsh@redhat.com) wrote:
>>> tcontext=system_u:object_r:device_t:s0 tclass=chr_file
>>> type=1400 audit(1280174589.476:7): avc: denied { read } for pid=1
>>> comm="systemd" name="autofs" dev=devtmpfs ino=9482
>>> scontext=system_u:system_r:init_t:s0
>>> tcontext=system_u:object_r:device_t:s0 tclass=chr_file
>>> type=1400 audit(1280174589.476:8): avc: denied { read } for pid=1
>>> comm="systemd" name="autofs" dev=devtmpfs ino=9482
>>> scontext=system_u:system_r:init_t:s0
>>> tcontext=system_u:object_r:device_t:s0 tclass=chr_file
>>>
>>> Lennart, we talked about this earlier. I think this is caused by the
>>> modprobe calls to create /dev/autofs. Since udev is not created at the
>>> point that init loads the kernel modules, the devices get created with
>>> the wrong label. Once udev starts the labels get fixed.
>>>
>>> I can allow init_t to read device_t chr_files.
>>
>> Hmm, I think a cleaner fix would be to make systemd relabel this device
>> properly before accessing it? Given that this is only one device this
>> should not be a problem for us to maintain, I think? How would the
>> fixing of the label work? Would we have to spawn restorecon for this, or
>> can we actually do this in C without too much work?
>
> I guess we can just do what udev is doing, and call setfilecon(), with
> a context of an earlier matchpathcon().
>
> Kay
> _______________________________________________
> systemd-devel mailing list
> systemd-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/systemd-devel
Here is the updated patch with a fix for the labeling of /dev/autofs
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.14 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org/
iEYEARECAAYFAkxQMyoACgkQrlYvE4MpobNviACfWgxsjW2xzz1qznFex8RVAQHf
gIEAmwRmRcLvGqYtwQaZ3WKIg8wmrwNk
=pC2e
2010-07-28 15:39:54 +02:00
|
|
|
|
2011-06-30 04:16:10 +02:00
|
|
|
/* Process-wide argument count; starts out as 0. Nothing in the visible part of this file assigns it —
 * presumably it is filled in by the program's main() at startup (TODO confirm against callers). */
int saved_argc = 0;
|
|
|
|
/* Process-wide argument vector; starts out as NULL. Nothing in the visible part of this file assigns it —
 * presumably it is filled in by the program's main() at startup (TODO confirm against callers). */
char **saved_argv = NULL;
|
2016-06-13 16:28:42 +02:00
|
|
|
/* Cached answer of in_initrd(): negative means "not determined yet", otherwise the stored value is returned
 * as-is. in_initrd_force() overwrites it directly. */
static int saved_in_initrd = -1;
|
2012-09-24 14:43:07 +02:00
|
|
|
|
2011-03-18 03:03:41 +01:00
|
|
|
/* Returns the page size reported by sysconf(_SC_PAGESIZE). The value is queried once and then served from a
 * thread-local cache; a non-positive sysconf() result trips an assertion. */
size_t page_size(void) {
        static thread_local size_t cached = 0;
        long reported;

        /* Fast path: already looked up */
        if (_likely_(cached > 0))
                return cached;

        reported = sysconf(_SC_PAGESIZE);
        assert(reported > 0);

        return cached = (size_t) reported;
}
|
|
|
|
|
2015-03-16 18:29:26 +01:00
|
|
|
/* Returns true if /run/plymouth/pid exists (as determined by access() with F_OK), false otherwise. */
bool plymouth_running(void) {
        if (access("/run/plymouth/pid", F_OK) < 0)
                return false;

        return true;
}
|
|
|
|
|
2011-06-27 22:44:12 +02:00
|
|
|
/* Returns true if the display string starts with ':' immediately followed by a decimal digit (e.g. ":0"),
 * false otherwise. The argument must not be NULL. */
bool display_is_local(const char *display) {
        assert(display);

        /* Only look at the second character once we know the first one is a colon */
        if (display[0] != ':')
                return false;

        return display[1] >= '0' && display[1] <= '9';
}
|
|
|
|
|
2011-08-22 14:58:50 +02:00
|
|
|
bool kexec_loaded(void) {
|
2017-10-15 23:00:54 +02:00
|
|
|
_cleanup_free_ char *s = NULL;
|
|
|
|
|
|
|
|
if (read_one_line_file("/sys/kernel/kexec_loaded", &s) < 0)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return s[0] == '1';
|
2011-08-22 14:58:50 +02:00
|
|
|
}
|
2011-09-28 04:25:13 +02:00
|
|
|
|
2011-10-07 21:06:39 +02:00
|
|
|
/* Translates the access mode contained in a set of open() flags into the matching mmap() protection bits:
 *
 *   O_RDONLY → PROT_READ
 *   O_WRONLY → PROT_WRITE
 *   O_RDWR   → PROT_READ|PROT_WRITE
 *
 * All bits outside of O_ACCMODE are ignored. Returns -EINVAL for any other access mode. */
int prot_from_flags(int flags) {
        int mode = flags & O_ACCMODE;

        if (mode == O_RDONLY)
                return PROT_READ;

        if (mode == O_WRONLY)
                return PROT_WRITE;

        if (mode == O_RDWR)
                return PROT_READ|PROT_WRITE;

        return -EINVAL;
}
|
2011-10-12 04:29:11 +02:00
|
|
|
|
2012-05-16 14:22:40 +02:00
|
|
|
bool in_initrd(void) {
|
2012-07-10 18:46:26 +02:00
|
|
|
struct statfs s;
|
2012-05-21 20:00:58 +02:00
|
|
|
|
2016-06-13 16:28:42 +02:00
|
|
|
if (saved_in_initrd >= 0)
|
|
|
|
return saved_in_initrd;
|
2012-07-10 18:46:26 +02:00
|
|
|
|
|
|
|
/* We make two checks here:
|
|
|
|
*
|
|
|
|
* 1. the flag file /etc/initrd-release must exist
|
|
|
|
* 2. the root file system must be a memory file system
|
|
|
|
*
|
|
|
|
* The second check is extra paranoia, since misdetecting an
|
2016-10-02 19:37:21 +02:00
|
|
|
* initrd can have bad consequences due the initrd
|
2012-07-10 18:46:26 +02:00
|
|
|
* emptying when transititioning to the main systemd.
|
|
|
|
*/
|
|
|
|
|
2016-06-13 16:28:42 +02:00
|
|
|
saved_in_initrd = access("/etc/initrd-release", F_OK) >= 0 &&
|
|
|
|
statfs("/", &s) >= 0 &&
|
|
|
|
is_temporary_fs(&s);
|
2012-05-16 14:22:40 +02:00
|
|
|
|
2016-06-13 16:28:42 +02:00
|
|
|
return saved_in_initrd;
|
|
|
|
}
|
|
|
|
|
|
|
|
void in_initrd_force(bool value) {
|
|
|
|
saved_in_initrd = value;
|
2012-05-16 14:22:40 +02:00
|
|
|
}
|
2012-05-30 15:01:51 +02:00
|
|
|
|
2012-10-22 14:31:46 +02:00
|
|
|
/* Binary search like bsearch(), but the comparison callback receives an additional user pointer ("arg") as
 * its third argument — glibc's callback-based API lacks one, which makes it rather useless for us.
 *
 * "base" points to "nmemb" elements of "size" bytes each, sorted according to "compar". compar(key, element,
 * arg) shall return < 0, 0 or > 0 if the key sorts before, equal to or after the element. Returns a pointer
 * to a matching element, or NULL if there is none. nmemb * size must not overflow. */
void *xbsearch_r(const void *key, const void *base, size_t nmemb, size_t size,
                 int (*compar) (const void *, const void *, void *), void *arg) {

        size_t lo = 0, hi = nmemb;

        assert(!size_multiply_overflow(nmemb, size));

        /* Invariant: a match, if any, lives in the half-open index range [lo, hi) */
        while (lo < hi) {
                size_t mid = (lo + hi) / 2;
                const void *element = (const uint8_t*) base + mid * size;
                int c = compar(key, element, arg);

                if (c == 0)
                        return (void *) element;

                if (c < 0)
                        hi = mid;
                else
                        lo = mid + 1;
        }

        return NULL;
}
|
2012-11-02 17:27:15 +01:00
|
|
|
|
2012-12-25 16:29:51 +01:00
|
|
|
int on_ac_power(void) {
|
|
|
|
bool found_offline = false, found_online = false;
|
|
|
|
_cleanup_closedir_ DIR *d = NULL;
|
2016-12-09 10:04:30 +01:00
|
|
|
struct dirent *de;
|
2012-12-25 16:29:51 +01:00
|
|
|
|
|
|
|
d = opendir("/sys/class/power_supply");
|
|
|
|
if (!d)
|
2015-03-04 01:07:28 +01:00
|
|
|
return errno == ENOENT ? true : -errno;
|
2012-12-25 16:29:51 +01:00
|
|
|
|
2016-12-09 10:04:30 +01:00
|
|
|
FOREACH_DIRENT(de, d, return -errno) {
|
2012-12-25 16:29:51 +01:00
|
|
|
_cleanup_close_ int fd = -1, device = -1;
|
|
|
|
char contents[6];
|
|
|
|
ssize_t n;
|
|
|
|
|
|
|
|
device = openat(dirfd(d), de->d_name, O_DIRECTORY|O_RDONLY|O_CLOEXEC|O_NOCTTY);
|
|
|
|
if (device < 0) {
|
2017-09-29 00:37:23 +02:00
|
|
|
if (IN_SET(errno, ENOENT, ENOTDIR))
|
2012-12-25 16:29:51 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
fd = openat(device, "type", O_RDONLY|O_CLOEXEC|O_NOCTTY);
|
|
|
|
if (fd < 0) {
|
|
|
|
if (errno == ENOENT)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
n = read(fd, contents, sizeof(contents));
|
|
|
|
if (n < 0)
|
|
|
|
return -errno;
|
|
|
|
|
|
|
|
if (n != 6 || memcmp(contents, "Mains\n", 6))
|
|
|
|
continue;
|
|
|
|
|
2014-03-18 19:22:43 +01:00
|
|
|
safe_close(fd);
|
2012-12-25 16:29:51 +01:00
|
|
|
fd = openat(device, "online", O_RDONLY|O_CLOEXEC|O_NOCTTY);
|
|
|
|
if (fd < 0) {
|
|
|
|
if (errno == ENOENT)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
n = read(fd, contents, sizeof(contents));
|
|
|
|
if (n < 0)
|
|
|
|
return -errno;
|
|
|
|
|
|
|
|
if (n != 2 || contents[1] != '\n')
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
if (contents[0] == '1') {
|
|
|
|
found_online = true;
|
|
|
|
break;
|
|
|
|
} else if (contents[0] == '0')
|
|
|
|
found_offline = true;
|
|
|
|
else
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
|
|
|
return found_online || !found_offline;
|
|
|
|
}
|
2013-02-11 23:48:36 +01:00
|
|
|
|
2013-12-13 22:02:47 +01:00
|
|
|
int container_get_leader(const char *machine, pid_t *pid) {
|
|
|
|
_cleanup_free_ char *s = NULL, *class = NULL;
|
|
|
|
const char *p;
|
|
|
|
pid_t leader;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(machine);
|
|
|
|
assert(pid);
|
|
|
|
|
2015-08-23 14:33:50 +02:00
|
|
|
if (!machine_name_is_valid(machine))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2015-02-03 02:05:59 +01:00
|
|
|
p = strjoina("/run/systemd/machines/", machine);
|
2018-03-23 21:31:14 +01:00
|
|
|
r = parse_env_file(NULL, p, NEWLINE, "LEADER", &s, "CLASS", &class, NULL);
|
2013-12-13 22:02:47 +01:00
|
|
|
if (r == -ENOENT)
|
|
|
|
return -EHOSTDOWN;
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
if (!s)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
if (!streq_ptr(class, "container"))
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
r = parse_pid(s, &leader);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
if (leader <= 1)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
*pid = leader;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
namespace helpers: Allow entering a UID namespace
To be able to use `systemd-run` or `machinectl login` on a container
that is in a private user namespace, the sub-process must have entered
the user namespace before connecting to the container's D-Bus, otherwise
the UID and GID in the peer credentials are garbage.
So we extend namespace_open and namespace_enter to support UID namespaces,
and we enter the UID namespace in bus_container_connect_{socket,kernel}.
namespace_open will degrade to a no-op if user namespaces are not enabled
in the kernel.
Special handling is required for the setns call in namespace_enter with
a user namespace, since transitioning to your own namespace is forbidden,
as it would result in re-entering your user namespace as root.
Arguably it may be valid to check this at the call site, rather than
inside namespace_enter, but it is less code to do it inside, and if the
intention of calling namespace_enter is to *be* in the target namespace,
rather than to transition to the target namespace, it is a reasonable
approach.
The check for whether the user namespace is the same must happen before
entering namespaces, as we may not be able to access /proc during the
intermediate transition stage.
We can't instead attempt to enter the user namespace and then ignore
the failure from it being the same namespace, since the error code is
not distinct, and we can't compare namespaces while mid-transition.
2015-08-17 10:52:13 +02:00
|
|
|
int namespace_open(pid_t pid, int *pidns_fd, int *mntns_fd, int *netns_fd, int *userns_fd, int *root_fd) {
|
|
|
|
_cleanup_close_ int pidnsfd = -1, mntnsfd = -1, netnsfd = -1, usernsfd = -1;
|
2014-05-21 10:44:45 +02:00
|
|
|
int rfd = -1;
|
2013-12-13 22:02:47 +01:00
|
|
|
|
|
|
|
assert(pid >= 0);
|
|
|
|
|
2014-05-18 13:48:53 +02:00
|
|
|
if (mntns_fd) {
|
|
|
|
const char *mntns;
|
2013-12-17 01:03:09 +01:00
|
|
|
|
2014-05-18 13:48:53 +02:00
|
|
|
mntns = procfs_file_alloca(pid, "ns/mnt");
|
|
|
|
mntnsfd = open(mntns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
|
|
|
|
if (mntnsfd < 0)
|
|
|
|
return -errno;
|
|
|
|
}
|
2013-12-13 22:02:47 +01:00
|
|
|
|
2014-05-18 13:48:53 +02:00
|
|
|
if (pidns_fd) {
|
|
|
|
const char *pidns;
|
|
|
|
|
|
|
|
pidns = procfs_file_alloca(pid, "ns/pid");
|
|
|
|
pidnsfd = open(pidns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
|
|
|
|
if (pidnsfd < 0)
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (netns_fd) {
|
|
|
|
const char *netns;
|
|
|
|
|
|
|
|
netns = procfs_file_alloca(pid, "ns/net");
|
|
|
|
netnsfd = open(netns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
|
|
|
|
if (netnsfd < 0)
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
namespace helpers: Allow entering a UID namespace
To be able to use `systemd-run` or `machinectl login` on a container
that is in a private user namespace, the sub-process must have entered
the user namespace before connecting to the container's D-Bus, otherwise
the UID and GID in the peer credentials are garbage.
So we extend namespace_open and namespace_enter to support UID namespaces,
and we enter the UID namespace in bus_container_connect_{socket,kernel}.
namespace_open will degrade to a no-op if user namespaces are not enabled
in the kernel.
Special handling is required for the setns call in namespace_enter with
a user namespace, since transitioning to your own namespace is forbidden,
as it would result in re-entering your user namespace as root.
Arguably it may be valid to check this at the call site, rather than
inside namespace_enter, but it is less code to do it inside, and if the
intention of calling namespace_enter is to *be* in the target namespace,
rather than to transition to the target namespace, it is a reasonable
approach.
The check for whether the user namespace is the same must happen before
entering namespaces, as we may not be able to access /proc during the
intermediate transition stage.
We can't instead attempt to enter the user namespace and then ignore
the failure from it being the same namespace, since the error code is
not distinct, and we can't compare namespaces while mid-transition.
2015-08-17 10:52:13 +02:00
|
|
|
if (userns_fd) {
|
|
|
|
const char *userns;
|
|
|
|
|
|
|
|
userns = procfs_file_alloca(pid, "ns/user");
|
|
|
|
usernsfd = open(userns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
|
|
|
|
if (usernsfd < 0 && errno != ENOENT)
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
2014-05-18 13:48:53 +02:00
|
|
|
if (root_fd) {
|
|
|
|
const char *root;
|
|
|
|
|
|
|
|
root = procfs_file_alloca(pid, "root");
|
|
|
|
rfd = open(root, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY);
|
|
|
|
if (rfd < 0)
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (pidns_fd)
|
|
|
|
*pidns_fd = pidnsfd;
|
2013-12-13 22:02:47 +01:00
|
|
|
|
2014-05-18 13:48:53 +02:00
|
|
|
if (mntns_fd)
|
|
|
|
*mntns_fd = mntnsfd;
|
|
|
|
|
|
|
|
if (netns_fd)
|
|
|
|
*netns_fd = netnsfd;
|
|
|
|
|
namespace helpers: Allow entering a UID namespace
To be able to use `systemd-run` or `machinectl login` on a container
that is in a private user namespace, the sub-process must have entered
the user namespace before connecting to the container's D-Bus, otherwise
the UID and GID in the peer credentials are garbage.
So we extend namespace_open and namespace_enter to support UID namespaces,
and we enter the UID namespace in bus_container_connect_{socket,kernel}.
namespace_open will degrade to a no-op if user namespaces are not enabled
in the kernel.
Special handling is required for the setns call in namespace_enter with
a user namespace, since transitioning to your own namespace is forbidden,
as it would result in re-entering your user namespace as root.
Arguably it may be valid to check this at the call site, rather than
inside namespace_enter, but it is less code to do it inside, and if the
intention of calling namespace_enter is to *be* in the target namespace,
rather than to transition to the target namespace, it is a reasonable
approach.
The check for whether the user namespace is the same must happen before
entering namespaces, as we may not be able to access /proc during the
intermediate transition stage.
We can't instead attempt to enter the user namespace and then ignore
the failure from it being the same namespace, since the error code is
not distinct, and we can't compare namespaces while mid-transition.
2015-08-17 10:52:13 +02:00
|
|
|
if (userns_fd)
|
|
|
|
*userns_fd = usernsfd;
|
|
|
|
|
2014-05-18 13:48:53 +02:00
|
|
|
if (root_fd)
|
|
|
|
*root_fd = rfd;
|
|
|
|
|
namespace helpers: Allow entering a UID namespace
To be able to use `systemd-run` or `machinectl login` on a container
that is in a private user namespace, the sub-process must have entered
the user namespace before connecting to the container's D-Bus, otherwise
the UID and GID in the peer credentials are garbage.
So we extend namespace_open and namespace_enter to support UID namespaces,
and we enter the UID namespace in bus_container_connect_{socket,kernel}.
namespace_open will degrade to a no-op if user namespaces are not enabled
in the kernel.
Special handling is required for the setns call in namespace_enter with
a user namespace, since transitioning to your own namespace is forbidden,
as it would result in re-entering your user namespace as root.
Arguably it may be valid to check this at the call site, rather than
inside namespace_enter, but it is less code to do it inside, and if the
intention of calling namespace_enter is to *be* in the target namespace,
rather than to transition to the target namespace, it is a reasonable
approach.
The check for whether the user namespace is the same must happen before
entering namespaces, as we may not be able to access /proc during the
intermediate transition stage.
We can't instead attempt to enter the user namespace and then ignore
the failure from it being the same namespace, since the error code is
not distinct, and we can't compare namespaces while mid-transition.
2015-08-17 10:52:13 +02:00
|
|
|
pidnsfd = mntnsfd = netnsfd = usernsfd = -1;
|
2013-12-13 22:02:47 +01:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
namespace helpers: Allow entering a UID namespace
To be able to use `systemd-run` or `machinectl login` on a container
that is in a private user namespace, the sub-process must have entered
the user namespace before connecting to the container's D-Bus, otherwise
the UID and GID in the peer credentials are garbage.
So we extend namespace_open and namespace_enter to support UID namespaces,
and we enter the UID namespace in bus_container_connect_{socket,kernel}.
namespace_open will degrade to a no-op if user namespaces are not enabled
in the kernel.
Special handling is required for the setns call in namespace_enter with
a user namespace, since transitioning to your own namespace is forbidden,
as it would result in re-entering your user namespace as root.
Arguably it may be valid to check this at the call site, rather than
inside namespace_enter, but it is less code to do it inside, and if the
intention of calling namespace_enter is to *be* in the target namespace,
rather than to transition to the target namespace, it is a reasonable
approach.
The check for whether the user namespace is the same must happen before
entering namespaces, as we may not be able to access /proc during the
intermediate transition stage.
We can't instead attempt to enter the user namespace and then ignore
the failure from it being the same namespace, since the error code is
not distinct, and we can't compare namespaces while mid-transition.
2015-08-17 10:52:13 +02:00
|
|
|
int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd) {
|
|
|
|
if (userns_fd >= 0) {
|
|
|
|
/* Can't setns to your own userns, since then you could
|
|
|
|
* escalate from non-root to root in your own namespace, so
|
|
|
|
* check if namespaces equal before attempting to enter. */
|
|
|
|
_cleanup_free_ char *userns_fd_path = NULL;
|
|
|
|
int r;
|
|
|
|
if (asprintf(&userns_fd_path, "/proc/self/fd/%d", userns_fd) < 0)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2017-06-17 18:37:16 +02:00
|
|
|
r = files_same(userns_fd_path, "/proc/self/ns/user", 0);
|
namespace helpers: Allow entering a UID namespace
To be able to use `systemd-run` or `machinectl login` on a container
that is in a private user namespace, the sub-process must have entered
the user namespace before connecting to the container's D-Bus, otherwise
the UID and GID in the peer credentials are garbage.
So we extend namespace_open and namespace_enter to support UID namespaces,
and we enter the UID namespace in bus_container_connect_{socket,kernel}.
namespace_open will degrade to a no-op if user namespaces are not enabled
in the kernel.
Special handling is required for the setns call in namespace_enter with
a user namespace, since transitioning to your own namespace is forbidden,
as it would result in re-entering your user namespace as root.
Arguably it may be valid to check this at the call site, rather than
inside namespace_enter, but it is less code to do it inside, and if the
intention of calling namespace_enter is to *be* in the target namespace,
rather than to transition to the target namespace, it is a reasonable
approach.
The check for whether the user namespace is the same must happen before
entering namespaces, as we may not be able to access /proc during the
intermediate transition stage.
We can't instead attempt to enter the user namespace and then ignore
the failure from it being the same namespace, since the error code is
not distinct, and we can't compare namespaces while mid-transition.
2015-08-17 10:52:13 +02:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
if (r)
|
|
|
|
userns_fd = -1;
|
|
|
|
}
|
2013-12-13 22:02:47 +01:00
|
|
|
|
2014-05-18 13:48:53 +02:00
|
|
|
if (pidns_fd >= 0)
|
|
|
|
if (setns(pidns_fd, CLONE_NEWPID) < 0)
|
|
|
|
return -errno;
|
2013-12-17 01:03:09 +01:00
|
|
|
|
2014-05-18 13:48:53 +02:00
|
|
|
if (mntns_fd >= 0)
|
|
|
|
if (setns(mntns_fd, CLONE_NEWNS) < 0)
|
|
|
|
return -errno;
|
2013-12-13 22:02:47 +01:00
|
|
|
|
2014-05-18 13:48:53 +02:00
|
|
|
if (netns_fd >= 0)
|
|
|
|
if (setns(netns_fd, CLONE_NEWNET) < 0)
|
|
|
|
return -errno;
|
2013-12-13 22:02:47 +01:00
|
|
|
|
namespace helpers: Allow entering a UID namespace
To be able to use `systemd-run` or `machinectl login` on a container
that is in a private user namespace, the sub-process must have entered
the user namespace before connecting to the container's D-Bus, otherwise
the UID and GID in the peer credentials are garbage.
So we extend namespace_open and namespace_enter to support UID namespaces,
and we enter the UID namespace in bus_container_connect_{socket,kernel}.
namespace_open will degrade to a no-op if user namespaces are not enabled
in the kernel.
Special handling is required for the setns call in namespace_enter with
a user namespace, since transitioning to your own namespace is forbidden,
as it would result in re-entering your user namespace as root.
Arguably it may be valid to check this at the call site, rather than
inside namespace_enter, but it is less code to do it inside, and if the
intention of calling namespace_enter is to *be* in the target namespace,
rather than to transition to the target namespace, it is a reasonable
approach.
The check for whether the user namespace is the same must happen before
entering namespaces, as we may not be able to access /proc during the
intermediate transition stage.
We can't instead attempt to enter the user namespace and then ignore
the failure from it being the same namespace, since the error code is
not distinct, and we can't compare namespaces while mid-transition.
2015-08-17 10:52:13 +02:00
|
|
|
if (userns_fd >= 0)
|
|
|
|
if (setns(userns_fd, CLONE_NEWUSER) < 0)
|
|
|
|
return -errno;
|
|
|
|
|
2014-05-18 13:48:53 +02:00
|
|
|
if (root_fd >= 0) {
|
|
|
|
if (fchdir(root_fd) < 0)
|
|
|
|
return -errno;
|
|
|
|
|
|
|
|
if (chroot(".") < 0)
|
|
|
|
return -errno;
|
|
|
|
}
|
2013-12-13 22:02:47 +01:00
|
|
|
|
2015-05-20 14:41:39 +02:00
|
|
|
return reset_uid_gid();
|
2013-12-13 22:02:47 +01:00
|
|
|
}
|
2013-12-18 04:19:20 +01:00
|
|
|
|
2014-03-04 19:20:21 +01:00
|
|
|
uint64_t physical_memory(void) {
|
2016-06-08 18:56:20 +02:00
|
|
|
_cleanup_free_ char *root = NULL, *value = NULL;
|
|
|
|
uint64_t mem, lim;
|
|
|
|
size_t ps;
|
|
|
|
long sc;
|
2018-05-17 04:27:58 +02:00
|
|
|
int r;
|
2014-03-04 19:20:21 +01:00
|
|
|
|
2016-06-08 18:56:20 +02:00
|
|
|
/* We return this as uint64_t in case we are running as 32bit process on a 64bit kernel with huge amounts of
|
|
|
|
* memory.
|
|
|
|
*
|
|
|
|
* In order to support containers nicely that have a configured memory limit we'll take the minimum of the
|
|
|
|
* physically reported amount of memory and the limit configured for the root cgroup, if there is any. */
|
|
|
|
|
|
|
|
sc = sysconf(_SC_PHYS_PAGES);
|
|
|
|
assert(sc > 0);
|
|
|
|
|
|
|
|
ps = page_size();
|
|
|
|
mem = (uint64_t) sc * (uint64_t) ps;
|
|
|
|
|
2018-05-17 04:27:58 +02:00
|
|
|
r = cg_get_root_path(&root);
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to determine root cgroup, ignoring cgroup memory limit: %m");
|
2016-06-08 18:56:20 +02:00
|
|
|
return mem;
|
2018-05-17 04:27:58 +02:00
|
|
|
}
|
2016-06-08 18:56:20 +02:00
|
|
|
|
2018-05-17 04:27:58 +02:00
|
|
|
r = cg_all_unified();
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to determine root unified mode, ignoring cgroup memory limit: %m");
|
2016-06-08 18:56:20 +02:00
|
|
|
return mem;
|
2018-05-17 04:27:58 +02:00
|
|
|
}
|
|
|
|
if (r > 0) {
|
|
|
|
r = cg_get_attribute("memory", root, "memory.max", &value);
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to read memory.max cgroup attribute, ignoring cgroup memory limit: %m");
|
|
|
|
return mem;
|
|
|
|
}
|
2016-06-08 18:56:20 +02:00
|
|
|
|
2018-05-17 04:27:58 +02:00
|
|
|
if (streq(value, "max"))
|
|
|
|
return mem;
|
|
|
|
} else {
|
|
|
|
r = cg_get_attribute("memory", root, "memory.limit_in_bytes", &value);
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to read memory.limit_in_bytes cgroup attribute, ignoring cgroup memory limit: %m");
|
|
|
|
return mem;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
r = safe_atou64(value, &lim);
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to parse cgroup memory limit '%s', ignoring: %m", value);
|
|
|
|
return mem;
|
|
|
|
}
|
|
|
|
if (lim == UINT64_MAX)
|
2016-06-08 18:56:20 +02:00
|
|
|
return mem;
|
2014-03-04 19:20:21 +01:00
|
|
|
|
2016-06-08 18:56:20 +02:00
|
|
|
/* Make sure the limit is a multiple of our own page size */
|
|
|
|
lim /= ps;
|
|
|
|
lim *= ps;
|
2014-03-04 19:20:21 +01:00
|
|
|
|
2016-06-08 18:56:20 +02:00
|
|
|
return MIN(mem, lim);
|
2014-03-04 19:20:21 +01:00
|
|
|
}
|
2014-03-06 21:14:26 +01:00
|
|
|
|
2016-06-08 20:45:32 +02:00
|
|
|
/* Returns the physical memory size, multiplied by v divided by max. Returns UINT64_MAX on overflow. On
 * success the result is a multiple of the page size (rounds down). max must be non-zero. */
uint64_t physical_memory_scale(uint64_t v, uint64_t max) {
        uint64_t psz, pages, scaled;

        assert(max > 0);

        psz = page_size();
        assert(psz > 0);

        /* Do the calculation in units of pages, so that the result ends up page-aligned */
        pages = physical_memory() / psz;
        assert(pages > 0);

        /* Would pages * v overflow? */
        if (v != 0 && pages > UINT64_MAX / v)
                return UINT64_MAX;

        scaled = pages * v / max;

        /* Would converting back to bytes overflow? */
        if (scaled > UINT64_MAX / psz)
                return UINT64_MAX;

        return scaled * psz;
}
|
|
|
|
|
2016-07-19 15:58:49 +02:00
|
|
|
uint64_t system_tasks_max(void) {
|
|
|
|
|
|
|
|
uint64_t a = TASKS_MAX, b = TASKS_MAX;
|
2018-01-17 15:35:01 +01:00
|
|
|
_cleanup_free_ char *root = NULL;
|
2018-05-17 04:32:15 +02:00
|
|
|
int r;
|
2016-07-19 15:58:49 +02:00
|
|
|
|
|
|
|
/* Determine the maximum number of tasks that may run on this system. We check three sources to determine this
|
|
|
|
* limit:
|
|
|
|
*
|
2018-01-17 15:31:23 +01:00
|
|
|
* a) the maximum tasks value the kernel allows on this architecture
|
2016-07-19 15:58:49 +02:00
|
|
|
* b) the cgroups pids_max attribute for the system
|
2018-01-17 15:31:23 +01:00
|
|
|
* c) the kernel's configured maximum PID value
|
2016-07-19 15:58:49 +02:00
|
|
|
*
|
|
|
|
* And then pick the smallest of the three */
|
|
|
|
|
2018-05-17 04:32:15 +02:00
|
|
|
r = procfs_tasks_get_limit(&a);
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to read maximum number of tasks from /proc, ignoring: %m");
|
2016-07-19 15:58:49 +02:00
|
|
|
|
2018-05-17 04:32:15 +02:00
|
|
|
r = cg_get_root_path(&root);
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to determine cgroup root path, ignoring: %m");
|
|
|
|
else {
|
2018-01-17 15:35:01 +01:00
|
|
|
_cleanup_free_ char *value = NULL;
|
2016-07-19 15:58:49 +02:00
|
|
|
|
2018-05-17 04:32:15 +02:00
|
|
|
r = cg_get_attribute("pids", root, "pids.max", &value);
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to read pids.max attribute of cgroup root, ignoring: %m");
|
|
|
|
else if (!streq(value, "max")) {
|
|
|
|
r = safe_atou64(value, &b);
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to parse pids.max attribute of cgroup root, ignoring: %m");
|
|
|
|
}
|
2016-07-19 15:58:49 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return MIN3(TASKS_MAX,
|
|
|
|
a <= 0 ? TASKS_MAX : a,
|
|
|
|
b <= 0 ? TASKS_MAX : b);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Multiply the system's task value by the fraction v/max. Hence, if max==100 this calculates percentages
 * relative to the system's maximum number of tasks. Returns UINT64_MAX on overflow. max must be non-zero. */
uint64_t system_tasks_max_scale(uint64_t v, uint64_t max) {
        uint64_t limit;

        assert(max > 0);

        limit = system_tasks_max();
        assert(limit > 0);

        /* Would limit * v overflow? */
        if (v != 0 && limit > UINT64_MAX / v)
                return UINT64_MAX;

        return limit * v / max;
}
|
|
|
|
|
2015-09-23 03:01:06 +02:00
|
|
|
int version(void) {
|
|
|
|
puts(PACKAGE_STRING "\n"
|
|
|
|
SYSTEMD_FEATURES);
|
|
|
|
return 0;
|
|
|
|
}
|
2017-12-23 15:02:58 +01:00
|
|
|
|
|
|
|
/* This is a direct translation of str_verscmp from boot.c */

/* Returns true for the ASCII decimal digits '0'…'9' only. */
static bool is_digit(int c) {
        return '0' <= c && c <= '9';
}

/* Sort weight of a character within a non-numeric run: NUL and digits weigh nothing, lowercase ASCII
 * letters keep their character value, and everything else is shifted up by 0x10000 so that it sorts after
 * the lowercase letters. */
static int c_order(int c) {
        bool lowercase = c >= 'a' && c <= 'z';

        if (c == 0 || is_digit(c))
                return 0;

        return lowercase ? c : c + 0x10000;
}

/* Compares two version strings. The strings are processed as alternating runs of non-digits (compared
 * character by character via c_order()) and digits (compared numerically: leading zeroes are skipped, then
 * the longer run wins, then the first differing digit decides). If all runs compare equal the result of
 * strcmp() on the full strings is returned. Returns < 0, 0 or > 0 if s1 sorts before, equal to or after s2. */
int str_verscmp(const char *s1, const char *s2) {
        const char *a = s1, *b = s2;

        assert(s1);
        assert(s2);

        while (*a || *b) {
                int digit_diff = 0;

                /* Non-numeric run */
                while ((*a && !is_digit(*a)) || (*b && !is_digit(*b))) {
                        int diff = c_order(*a) - c_order(*b);

                        if (diff != 0)
                                return diff;

                        a++;
                        b++;
                }

                /* Leading zeroes carry no weight */
                while (*a == '0')
                        a++;
                while (*b == '0')
                        b++;

                /* Numeric run: remember the first differing digit, but keep going to learn the lengths */
                for (; is_digit(*a) && is_digit(*b); a++, b++)
                        if (digit_diff == 0)
                                digit_diff = *a - *b;

                /* Whoever still has digits left has the bigger number */
                if (is_digit(*a))
                        return 1;
                if (is_digit(*b))
                        return -1;

                if (digit_diff != 0)
                        return digit_diff;
        }

        return strcmp(s1, s2);
}
|
2018-01-10 10:36:14 +01:00
|
|
|
|
|
|
|
/* Turn off core dumps but only if we're running outside of a container. This is done by writing
 * "|/bin/false" to /proc/sys/kernel/core_pattern; failure to do so is logged at debug level and otherwise
 * ignored. */
void disable_coredumps(void) {
        int r;

        if (detect_container() <= 0) {
                r = write_string_file("/proc/sys/kernel/core_pattern", "|/bin/false", 0);
                if (r < 0)
                        log_debug_errno(r, "Failed to turn off coredumps, ignoring: %m");
        }
}
|