Systemd/src/journal/journald-kmsg.c

455 lines
14 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <fcntl.h>
#include <sys/epoll.h>
#include <sys/mman.h>
2012-08-24 01:46:38 +02:00
#include <sys/socket.h>
#include <unistd.h>
#include "sd-device.h"
#include "sd-messages.h"
#include "alloc-util.h"
#include "device-util.h"
#include "escape.h"
#include "fd-util.h"
#include "format-util.h"
#include "io-util.h"
#include "journald-kmsg.h"
#include "journald-server.h"
#include "journald-syslog.h"
#include "parse-util.h"
2015-04-10 19:10:00 +02:00
#include "process-util.h"
#include "stdio-util.h"
#include "string-util.h"
void server_forward_kmsg(
2019-11-22 15:20:42 +01:00
Server *s,
int priority,
const char *identifier,
const char *message,
const struct ucred *ucred) {
_cleanup_free_ char *ident_buf = NULL;
struct iovec iovec[5];
char header_priority[DECIMAL_STR_MAX(priority) + 3],
header_pid[STRLEN("[]: ") + DECIMAL_STR_MAX(pid_t) + 1];
int n = 0;
assert(s);
assert(priority >= 0);
assert(priority <= 999);
assert(message);
if (_unlikely_(LOG_PRI(priority) > s->max_level_kmsg))
return;
if (_unlikely_(s->dev_kmsg_fd < 0))
return;
/* Never allow messages with kernel facility to be written to
* kmsg, regardless where the data comes from. */
priority = syslog_fixup_facility(priority);
/* First: priority field */
xsprintf(header_priority, "<%i>", priority);
iovec[n++] = IOVEC_MAKE_STRING(header_priority);
/* Second: identifier and PID */
if (ucred) {
if (!identifier) {
(void) get_process_comm(ucred->pid, &ident_buf);
identifier = ident_buf;
}
xsprintf(header_pid, "["PID_FMT"]: ", ucred->pid);
if (identifier)
iovec[n++] = IOVEC_MAKE_STRING(identifier);
iovec[n++] = IOVEC_MAKE_STRING(header_pid);
} else if (identifier) {
iovec[n++] = IOVEC_MAKE_STRING(identifier);
iovec[n++] = IOVEC_MAKE_STRING(": ");
}
/* Fourth: message */
iovec[n++] = IOVEC_MAKE_STRING(message);
iovec[n++] = IOVEC_MAKE_STRING("\n");
if (writev(s->dev_kmsg_fd, iovec, n) < 0)
log_debug_errno(errno, "Failed to write to /dev/kmsg for logging: %m");
}
static bool is_us(const char *identifier, const char *pid) {
pid_t pid_num;
if (!identifier || !pid)
return false;
if (parse_pid(pid, &pid_num) < 0)
return false;
return pid_num == getpid_cached() &&
streq(identifier, program_invocation_short_name);
}
void dev_kmsg_record(Server *s, char *p, size_t l) {
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
_cleanup_free_ char *message = NULL, *syslog_priority = NULL, *syslog_pid = NULL, *syslog_facility = NULL, *syslog_identifier = NULL, *source_time = NULL, *identifier = NULL, *pid = NULL;
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
struct iovec iovec[N_IOVEC_META_FIELDS + 7 + N_IOVEC_KERNEL_FIELDS + 2 + N_IOVEC_UDEV_FIELDS];
char *kernel_device = NULL;
unsigned long long usec;
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
size_t n = 0, z = 0, j;
int priority, r;
char *e, *f, *k;
uint64_t serial;
size_t pl;
assert(s);
assert(p);
if (l <= 0)
return;
e = memchr(p, ',', l);
if (!e)
return;
*e = 0;
r = safe_atoi(p, &priority);
if (r < 0 || priority < 0 || priority > 999)
return;
if (s->forward_to_kmsg && LOG_FAC(priority) != LOG_KERN)
return;
l -= (e - p) + 1;
p = e + 1;
e = memchr(p, ',', l);
if (!e)
return;
*e = 0;
r = safe_atou64(p, &serial);
if (r < 0)
return;
if (s->kernel_seqnum) {
/* We already read this one? */
if (serial < *s->kernel_seqnum)
return;
/* Did we lose any? */
if (serial > *s->kernel_seqnum)
server_driver_message(s, 0,
tree-wide: add SD_ID128_MAKE_STR, remove LOG_MESSAGE_ID Embedding sd_id128_t's in constant strings was rather cumbersome. We had SD_ID128_CONST_STR which returned a const char[], but it had two problems: - it wasn't possible to statically concatanate this array with a normal string - gcc wasn't really able to optimize this, and generated code to perform the "conversion" at runtime. Because of this, even our own code in coredumpctl wasn't using SD_ID128_CONST_STR. Add a new macro to generate a constant string: SD_ID128_MAKE_STR. It is not as elegant as SD_ID128_CONST_STR, because it requires a repetition of the numbers, but in practice it is more convenient to use, and allows gcc to generate smarter code: $ size .libs/systemd{,-logind,-journald}{.old,} text data bss dec hex filename 1265204 149564 4808 1419576 15a938 .libs/systemd.old 1260268 149564 4808 1414640 1595f0 .libs/systemd 246805 13852 209 260866 3fb02 .libs/systemd-logind.old 240973 13852 209 255034 3e43a .libs/systemd-logind 146839 4984 34 151857 25131 .libs/systemd-journald.old 146391 4984 34 151409 24f71 .libs/systemd-journald It is also much easier to check if a certain binary uses a certain MESSAGE_ID: $ strings .libs/systemd.old|grep MESSAGE_ID MESSAGE_ID=%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x MESSAGE_ID=%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x MESSAGE_ID=%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x MESSAGE_ID=%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x $ strings .libs/systemd|grep MESSAGE_ID MESSAGE_ID=c7a787079b354eaaa9e77b371893cd27 MESSAGE_ID=b07a249cd024414a82dd00cd181378ff MESSAGE_ID=641257651c1b4ec9a8624d7a40a9e1e7 MESSAGE_ID=de5b426a63be47a7b6ac3eaac82e2f6f MESSAGE_ID=d34d037fff1847e6ae669a370e694725 MESSAGE_ID=7d4958e842da4a758f6c1cdc7b36dcc5 MESSAGE_ID=1dee0369c7fc4736b7099b38ecb46ee7 MESSAGE_ID=39f53479d3a045ac8e11786248231fbf MESSAGE_ID=be02cf6855d2428ba40df7e9d022f03d MESSAGE_ID=7b05ebc668384222baa8881179cfda54 MESSAGE_ID=9d1aaa27d60140bd96365438aad20286
2016-11-06 18:48:23 +01:00
"MESSAGE_ID=" SD_MESSAGE_JOURNAL_MISSED_STR,
LOG_MESSAGE("Missed %"PRIu64" kernel messages",
serial - *s->kernel_seqnum),
NULL);
/* Make sure we never read this one again. Note that
* we always store the next message serial we expect
* here, simply because this makes handling the first
* message with serial 0 easy. */
*s->kernel_seqnum = serial + 1;
}
l -= (e - p) + 1;
p = e + 1;
f = memchr(p, ';', l);
if (!f)
return;
/* Kernel 3.6 has the flags field, kernel 3.5 lacks that */
e = memchr(p, ',', l);
if (!e || f < e)
e = f;
*e = 0;
r = safe_atollu(p, &usec);
if (r < 0)
return;
l -= (f - p) + 1;
p = f + 1;
e = memchr(p, '\n', l);
if (!e)
return;
*e = 0;
pl = e - p;
l -= (e - p) + 1;
k = e + 1;
for (j = 0; l > 0 && j < N_IOVEC_KERNEL_FIELDS; j++) {
char *m;
2015-09-08 12:23:18 +02:00
/* Metadata fields attached */
if (*k != ' ')
break;
k++, l--;
e = memchr(k, '\n', l);
if (!e)
goto finish;
*e = 0;
if (cunescape_length_with_prefix(k, e - k, "_KERNEL_", UNESCAPE_RELAX, &m) < 0)
break;
if (startswith(m, "_KERNEL_DEVICE="))
kernel_device = m + 15;
iovec[n++] = IOVEC_MAKE_STRING(m);
z++;
l -= (e - k) + 1;
k = e + 1;
}
if (kernel_device) {
_cleanup_(sd_device_unrefp) sd_device *d = NULL;
if (sd_device_new_from_device_id(&d, kernel_device) >= 0) {
const char *g;
char *b;
if (sd_device_get_devname(d, &g) >= 0) {
b = strjoin("_UDEV_DEVNODE=", g);
if (b) {
iovec[n++] = IOVEC_MAKE_STRING(b);
z++;
}
}
if (sd_device_get_sysname(d, &g) >= 0) {
b = strjoin("_UDEV_SYSNAME=", g);
if (b) {
iovec[n++] = IOVEC_MAKE_STRING(b);
z++;
}
}
j = 0;
FOREACH_DEVICE_DEVLINK(d, g) {
if (j >= N_IOVEC_UDEV_FIELDS)
break;
b = strjoin("_UDEV_DEVLINK=", g);
if (b) {
iovec[n++] = IOVEC_MAKE_STRING(b);
z++;
}
j++;
}
}
}
if (asprintf(&source_time, "_SOURCE_MONOTONIC_TIMESTAMP=%llu", usec) >= 0)
iovec[n++] = IOVEC_MAKE_STRING(source_time);
iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=kernel");
if (asprintf(&syslog_priority, "PRIORITY=%i", priority & LOG_PRIMASK) >= 0)
iovec[n++] = IOVEC_MAKE_STRING(syslog_priority);
if (asprintf(&syslog_facility, "SYSLOG_FACILITY=%i", LOG_FAC(priority)) >= 0)
iovec[n++] = IOVEC_MAKE_STRING(syslog_facility);
if (LOG_FAC(priority) == LOG_KERN)
iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=kernel");
else {
pl -= syslog_parse_identifier((const char**) &p, &identifier, &pid);
/* Avoid any messages we generated ourselves via
* log_info() and friends. */
if (is_us(identifier, pid))
goto finish;
if (identifier) {
syslog_identifier = strjoin("SYSLOG_IDENTIFIER=", identifier);
if (syslog_identifier)
iovec[n++] = IOVEC_MAKE_STRING(syslog_identifier);
}
if (pid) {
syslog_pid = strjoin("SYSLOG_PID=", pid);
if (syslog_pid)
iovec[n++] = IOVEC_MAKE_STRING(syslog_pid);
}
}
if (cunescape_length_with_prefix(p, pl, "MESSAGE=", UNESCAPE_RELAX, &message) >= 0)
iovec[n++] = IOVEC_MAKE_STRING(message);
server_dispatch_message(s, iovec, n, ELEMENTSOF(iovec), NULL, NULL, priority, 0);
finish:
for (j = 0; j < z; j++)
free(iovec[j].iov_base);
}
static int server_read_dev_kmsg(Server *s) {
char buffer[8192+1]; /* the kernel-side limit per record is 8K currently */
ssize_t l;
assert(s);
assert(s->dev_kmsg_fd >= 0);
l = read(s->dev_kmsg_fd, buffer, sizeof(buffer) - 1);
if (l == 0)
return 0;
if (l < 0) {
/* Old kernels who don't allow reading from /dev/kmsg
* return EINVAL when we try. So handle this cleanly,
* but don' try to ever read from it again. */
if (errno == EINVAL) {
s->dev_kmsg_event_source = sd_event_source_unref(s->dev_kmsg_event_source);
return 0;
}
if (IN_SET(errno, EAGAIN, EINTR, EPIPE))
return 0;
return log_error_errno(errno, "Failed to read from /dev/kmsg: %m");
}
dev_kmsg_record(s, buffer, l);
return 1;
}
int server_flush_dev_kmsg(Server *s) {
int r;
assert(s);
if (s->dev_kmsg_fd < 0)
return 0;
if (!s->dev_kmsg_readable)
return 0;
log_debug("Flushing /dev/kmsg...");
for (;;) {
r = server_read_dev_kmsg(s);
if (r < 0)
return r;
if (r == 0)
break;
}
return 0;
}
static int dispatch_dev_kmsg(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
Server *s = userdata;
assert(es);
assert(fd == s->dev_kmsg_fd);
assert(s);
if (revents & EPOLLERR)
log_warning("/dev/kmsg buffer overrun, some messages lost.");
if (!(revents & EPOLLIN))
log_error("Got invalid event from epoll for /dev/kmsg: %"PRIx32, revents);
return server_read_dev_kmsg(s);
}
int server_open_dev_kmsg(Server *s) {
mode_t mode;
int r;
assert(s);
if (s->read_kmsg)
mode = O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY;
else
mode = O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY;
s->dev_kmsg_fd = open("/dev/kmsg", mode);
if (s->dev_kmsg_fd < 0) {
log_full(errno == ENOENT ? LOG_DEBUG : LOG_WARNING,
"Failed to open /dev/kmsg, ignoring: %m");
return 0;
}
if (!s->read_kmsg)
return 0;
r = sd_event_add_io(s->event, &s->dev_kmsg_event_source, s->dev_kmsg_fd, EPOLLIN, dispatch_dev_kmsg, s);
if (r < 0) {
/* This will fail with EPERM on older kernels where
* /dev/kmsg is not readable. */
if (r == -EPERM) {
r = 0;
goto fail;
}
log_error_errno(r, "Failed to add /dev/kmsg fd to event loop: %m");
goto fail;
}
r = sd_event_source_set_priority(s->dev_kmsg_event_source, SD_EVENT_PRIORITY_IMPORTANT+10);
if (r < 0) {
log_error_errno(r, "Failed to adjust priority of kmsg event source: %m");
goto fail;
}
s->dev_kmsg_readable = true;
return 0;
fail:
s->dev_kmsg_event_source = sd_event_source_unref(s->dev_kmsg_event_source);
s->dev_kmsg_fd = safe_close(s->dev_kmsg_fd);
return r;
}
int server_open_kernel_seqnum(Server *s) {
_cleanup_close_ int fd = -1;
const char *fn;
uint64_t *p;
int r;
assert(s);
/* We store the seqnum we last read in an mmapped file. That way we can just use it like a variable,
* but it is persistent and automatically flushed at reboot. */
if (!s->read_kmsg)
return 0;
fn = strjoina(s->runtime_directory, "/kernel-seqnum");
fd = open(fn, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
if (fd < 0) {
log_error_errno(errno, "Failed to open %s, ignoring: %m", fn);
return 0;
}
r = posix_fallocate(fd, 0, sizeof(uint64_t));
if (r != 0) {
log_error_errno(r, "Failed to allocate sequential number file, ignoring: %m");
return 0;
}
p = mmap(NULL, sizeof(uint64_t), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (p == MAP_FAILED) {
log_error_errno(errno, "Failed to map sequential number file, ignoring: %m");
return 0;
}
s->kernel_seqnum = p;
return 0;
}