Systemd/src/journal/journald-native.c

508 lines
19 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: LGPL-2.1+ */
2012-08-24 01:46:38 +02:00
#include <stddef.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/statvfs.h>
#include <unistd.h>
#include "alloc-util.h"
#include "fd-util.h"
#include "fs-util.h"
#include "io-util.h"
#include "journal-importer.h"
#include "journal-util.h"
#include "journald-console.h"
#include "journald-kmsg.h"
#include "journald-native.h"
#include "journald-server.h"
#include "journald-syslog.h"
#include "journald-wall.h"
#include "memfd-util.h"
#include "memory-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "selinux-util.h"
#include "socket-util.h"
#include "string-util.h"
#include "strv.h"
#include "unaligned.h"
2014-11-02 21:46:42 +01:00
static bool allow_object_pid(const struct ucred *ucred) {
return ucred && ucred->uid == 0;
}
static void server_process_entry_meta(
const char *p, size_t l,
const struct ucred *ucred,
int *priority,
char **identifier,
char **message,
pid_t *object_pid) {
/* We need to determine the priority of this entry for the rate limiting logic */
if (l == 10 &&
startswith(p, "PRIORITY=") &&
p[9] >= '0' && p[9] <= '9')
*priority = (*priority & LOG_FACMASK) | (p[9] - '0');
else if (l == 17 &&
startswith(p, "SYSLOG_FACILITY=") &&
p[16] >= '0' && p[16] <= '9')
*priority = (*priority & LOG_PRIMASK) | ((p[16] - '0') << 3);
else if (l == 18 &&
startswith(p, "SYSLOG_FACILITY=") &&
p[16] >= '0' && p[16] <= '9' &&
p[17] >= '0' && p[17] <= '9')
*priority = (*priority & LOG_PRIMASK) | (((p[16] - '0')*10 + (p[17] - '0')) << 3);
else if (l >= 19 &&
startswith(p, "SYSLOG_IDENTIFIER=")) {
char *t;
t = memdup_suffix0(p + 18, l - 18);
if (t) {
free(*identifier);
*identifier = t;
}
} else if (l >= 8 &&
startswith(p, "MESSAGE=")) {
char *t;
t = memdup_suffix0(p + 8, l - 8);
if (t) {
free(*message);
*message = t;
}
} else if (l > STRLEN("OBJECT_PID=") &&
l < STRLEN("OBJECT_PID=") + DECIMAL_STR_MAX(pid_t) &&
startswith(p, "OBJECT_PID=") &&
allow_object_pid(ucred)) {
char buf[DECIMAL_STR_MAX(pid_t)];
memcpy(buf, p + STRLEN("OBJECT_PID="),
l - STRLEN("OBJECT_PID="));
buf[l-STRLEN("OBJECT_PID=")] = '\0';
(void) parse_pid(buf, object_pid);
}
}
static int server_process_entry(
Server *s,
const void *buffer, size_t *remaining,
ClientContext *context,
2014-11-02 21:46:42 +01:00
const struct ucred *ucred,
const struct timeval *tv,
const char *label, size_t label_len) {
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
/* Process a single entry from a native message. Returns 0 if nothing special happened and the message
* processing should continue, and a negative or positive value otherwise.
*
* Note that *remaining is altered on both success and failure. */
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
size_t n = 0, j, tn = (size_t) -1, m = 0, entry_size = 0;
char *identifier = NULL, *message = NULL;
struct iovec *iovec = NULL;
int priority = LOG_INFO;
pid_t object_pid = 0;
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
const char *p;
int r = 1;
p = buffer;
while (*remaining > 0) {
const char *e, *q;
e = memchr(p, '\n', *remaining);
if (!e) {
/* Trailing noise, let's ignore it, and flush what we collected */
log_debug("Received message with trailing noise, ignoring.");
break; /* finish processing of the message */
}
if (e == p) {
/* Entry separator */
*remaining -= 1;
break;
}
2017-10-04 16:01:32 +02:00
if (IN_SET(*p, '.', '#')) {
/* Ignore control commands for now, and comments too. */
*remaining -= (e - p) + 1;
p = e + 1;
continue;
}
/* A property follows */
if (n > ENTRY_FIELD_COUNT_MAX) {
log_debug("Received an entry that has more than " STRINGIFY(ENTRY_FIELD_COUNT_MAX) " fields, ignoring entry.");
goto finish;
}
/* n existing properties, 1 new, +1 for _TRANSPORT */
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
if (!GREEDY_REALLOC(iovec, m,
n + 2 +
N_IOVEC_META_FIELDS + N_IOVEC_OBJECT_FIELDS +
client_context_extra_fields_n_iovec(context))) {
r = log_oom();
goto finish;
}
q = memchr(p, '=', e - p);
if (q) {
if (journal_field_valid(p, q - p, false)) {
size_t l;
l = e - p;
if (l > DATA_SIZE_MAX) {
log_debug("Received text block of %zu bytes is too large, ignoring entry.", l);
goto finish;
}
if (entry_size + l + n + 1 > ENTRY_SIZE_MAX) { /* data + separators + trailer */
log_debug("Entry is too big (%zu bytes after processing %zu entries), ignoring entry.",
entry_size + l, n + 1);
goto finish;
}
/* If the field name starts with an underscore, skip the variable, since that indicates
* a trusted field */
iovec[n++] = IOVEC_MAKE((char*) p, l);
entry_size += l;
server_process_entry_meta(p, l, ucred,
&priority,
&identifier,
&message,
&object_pid);
}
*remaining -= (e - p) + 1;
p = e + 1;
continue;
} else {
uint64_t l, total;
char *k;
if (*remaining < e - p + 1 + sizeof(uint64_t) + 1) {
log_debug("Failed to parse message, ignoring.");
break;
}
l = unaligned_read_le64(e + 1);
if (l > DATA_SIZE_MAX) {
log_debug("Received binary data block of %"PRIu64" bytes is too large, ignoring entry.", l);
goto finish;
}
total = (e - p) + 1 + l;
if (entry_size + total + n + 1 > ENTRY_SIZE_MAX) { /* data + separators + trailer */
log_debug("Entry is too big (%"PRIu64"bytes after processing %zu fields), ignoring.",
entry_size + total, n + 1);
goto finish;
}
if ((uint64_t) *remaining < e - p + 1 + sizeof(uint64_t) + l + 1 ||
e[1+sizeof(uint64_t)+l] != '\n') {
log_debug("Failed to parse message, ignoring.");
break;
}
k = malloc(total);
if (!k) {
log_oom();
break;
}
memcpy(k, p, e - p);
k[e - p] = '=';
memcpy(k + (e - p) + 1, e + 1 + sizeof(uint64_t), l);
if (journal_field_valid(p, e - p, false)) {
iovec[n] = IOVEC_MAKE(k, (e - p) + 1 + l);
entry_size += iovec[n].iov_len;
n++;
server_process_entry_meta(k, (e - p) + 1 + l, ucred,
&priority,
&identifier,
&message,
&object_pid);
} else
free(k);
*remaining -= (e - p) + 1 + sizeof(uint64_t) + l + 1;
p = e + 1 + sizeof(uint64_t) + l + 1;
}
}
if (n <= 0)
goto finish;
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
tn = n++;
iovec[tn] = IOVEC_MAKE_STRING("_TRANSPORT=journal");
entry_size += STRLEN("_TRANSPORT=journal");
if (entry_size + n + 1 > ENTRY_SIZE_MAX) { /* data + separators + trailer */
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
log_debug("Entry is too big with %zu properties and %zu bytes, ignoring.", n, entry_size);
goto finish;
}
r = 0; /* Success, we read the message. */
if (!client_context_test_priority(context, priority))
goto finish;
if (message) {
if (s->forward_to_syslog)
server_forward_syslog(s, syslog_fixup_facility(priority), identifier, message, ucred, tv);
if (s->forward_to_kmsg)
server_forward_kmsg(s, priority, identifier, message, ucred);
if (s->forward_to_console)
server_forward_console(s, priority, identifier, message, ucred);
if (s->forward_to_wall)
server_forward_wall(s, priority, identifier, message, ucred);
}
server_dispatch_message(s, iovec, n, m, context, tv, priority, object_pid);
finish:
for (j = 0; j < n; j++) {
if (j == tn)
continue;
if (iovec[j].iov_base < buffer ||
(const char*) iovec[j].iov_base >= p + *remaining)
free(iovec[j].iov_base);
}
free(iovec);
free(identifier);
free(message);
return r;
}
void server_process_native_message(
Server *s,
const char *buffer, size_t buffer_size,
const struct ucred *ucred,
const struct timeval *tv,
const char *label, size_t label_len) {
size_t remaining = buffer_size;
ClientContext *context = NULL;
int r;
assert(s);
assert(buffer || buffer_size == 0);
if (ucred && pid_is_valid(ucred->pid)) {
r = client_context_get(s, ucred->pid, ucred, label, label_len, NULL, &context);
if (r < 0)
log_warning_errno(r, "Failed to retrieve credentials for PID " PID_FMT ", ignoring: %m", ucred->pid);
}
do {
r = server_process_entry(s,
(const uint8_t*) buffer + (buffer_size - remaining), &remaining,
context, ucred, tv, label, label_len);
} while (r == 0);
}
void server_process_native_file(
Server *s,
int fd,
2014-11-02 21:46:42 +01:00
const struct ucred *ucred,
const struct timeval *tv,
const char *label, size_t label_len) {
struct stat st;
bool sealed;
int r;
/* Data is in the passed fd, probably it didn't fit in a datagram. */
assert(s);
assert(fd >= 0);
/* If it's a memfd, check if it is sealed. If so, we can just
* mmap it and use it, and do not need to copy the data out. */
sealed = memfd_get_sealed(fd) > 0;
if (!sealed && (!ucred || ucred->uid != 0)) {
_cleanup_free_ char *k = NULL;
const char *e;
/* If this is not a sealed memfd, and the peer is unknown or
* unprivileged, then verify the path. */
r = fd_get_path(fd, &k);
if (r < 0) {
log_error_errno(r, "readlink(/proc/self/fd/%i) failed: %m", fd);
return;
}
e = PATH_STARTSWITH_SET(k, "/dev/shm/", "/tmp/", "/var/tmp/");
if (!e) {
log_error("Received file outside of allowed directories. Refusing.");
return;
}
if (!filename_is_valid(e)) {
log_error("Received file in subdirectory of allowed directories. Refusing.");
return;
}
}
if (fstat(fd, &st) < 0) {
log_error_errno(errno, "Failed to stat passed file, ignoring: %m");
return;
}
if (!S_ISREG(st.st_mode)) {
log_error("File passed is not regular. Ignoring.");
return;
}
if (st.st_size <= 0)
return;
/* When !sealed, set a lower memory limit. We have to read the file,
* effectively doubling memory use. */
if (st.st_size > ENTRY_SIZE_MAX / (sealed ? 1 : 2)) {
log_error("File passed too large (%"PRIu64" bytes). Ignoring.", (uint64_t) st.st_size);
return;
}
if (sealed) {
void *p;
size_t ps;
/* The file is sealed, we can just map it and use it. */
ps = PAGE_ALIGN(st.st_size);
p = mmap(NULL, ps, PROT_READ, MAP_PRIVATE, fd, 0);
if (p == MAP_FAILED) {
log_error_errno(errno, "Failed to map memfd, ignoring: %m");
return;
}
server_process_native_message(s, p, st.st_size, ucred, tv, label, label_len);
assert_se(munmap(p, ps) >= 0);
} else {
_cleanup_free_ void *p = NULL;
struct statvfs vfs;
ssize_t n;
if (fstatvfs(fd, &vfs) < 0) {
log_error_errno(errno, "Failed to stat file system of passed file, not processing it: %m");
return;
}
/* Refuse operating on file systems that have
* mandatory locking enabled, see:
*
* https://github.com/systemd/systemd/issues/1822
*/
if (vfs.f_flag & ST_MANDLOCK) {
log_error("Received file descriptor from file system with mandatory locking enabled, not processing it.");
return;
}
/* Make the fd non-blocking. On regular files this has
* the effect of bypassing mandatory locking. Of
* course, this should normally not be necessary given
* the check above, but let's better be safe than
* sorry, after all NFS is pretty confusing regarding
* file system flags, and we better don't trust it,
* and so is SMB. */
r = fd_nonblock(fd, true);
if (r < 0) {
log_error_errno(r, "Failed to make fd non-blocking, not processing it: %m");
return;
}
/* The file is not sealed, we can't map the file here, since
* clients might then truncate it and trigger a SIGBUS for
* us. So let's stupidly read it. */
p = malloc(st.st_size);
if (!p) {
log_oom();
return;
}
n = pread(fd, p, st.st_size, 0);
if (n < 0)
log_error_errno(errno, "Failed to read file, ignoring: %m");
else if (n > 0)
server_process_native_message(s, p, n, ucred, tv, label, label_len);
}
}
int server_open_native_socket(Server *s, const char *native_socket) {
2014-11-02 21:46:42 +01:00
int r;
assert(s);
assert(native_socket);
if (s->native_fd < 0) {
union sockaddr_union sa;
size_t sa_len;
r = sockaddr_un_set_path(&sa.un, native_socket);
if (r < 0)
return log_error_errno(r, "Unable to use namespace path %s for AF_UNIX socket: %m", native_socket);
sa_len = r;
s->native_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
if (s->native_fd < 0)
return log_error_errno(errno, "socket() failed: %m");
(void) sockaddr_un_unlink(&sa.un);
r = bind(s->native_fd, &sa.sa, sa_len);
if (r < 0)
return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
(void) chmod(sa.un.sun_path, 0666);
} else
(void) fd_nonblock(s->native_fd, true);
r = setsockopt_int(s->native_fd, SOL_SOCKET, SO_PASSCRED, true);
if (r < 0)
return log_error_errno(r, "SO_PASSCRED failed: %m");
#if HAVE_SELINUX
if (mac_selinux_use()) {
r = setsockopt_int(s->native_fd, SOL_SOCKET, SO_PASSSEC, true);
if (r < 0)
log_warning_errno(r, "SO_PASSSEC failed: %m");
}
#endif
r = setsockopt_int(s->native_fd, SOL_SOCKET, SO_TIMESTAMP, true);
if (r < 0)
return log_error_errno(r, "SO_TIMESTAMP failed: %m");
r = sd_event_add_io(s->event, &s->native_event_source, s->native_fd, EPOLLIN, server_process_datagram, s);
if (r < 0)
return log_error_errno(r, "Failed to add native server fd to event loop: %m");
r = sd_event_source_set_priority(s->native_event_source, SD_EVENT_PRIORITY_NORMAL+5);
if (r < 0)
return log_error_errno(r, "Failed to adjust native event source priority: %m");
return 0;
}