Systemd/src/journal/journal-file.c

/* SPDX-License-Identifier: LGPL-2.1+ */
#include <errno.h>
#include <fcntl.h>
#include <linux/fs.h>
#include <pthread.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/statvfs.h>
#include <sys/uio.h>
#include <unistd.h>
#include "sd-event.h"
#include "alloc-util.h"
#include "btrfs-util.h"
#include "chattr-util.h"
#include "compress.h"
#include "fd-util.h"
#include "fs-util.h"
#include "journal-authenticate.h"
#include "journal-def.h"
#include "journal-file.h"
#include "lookup3.h"
#include "memory-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "random-util.h"
#include "set.h"
#include "sort-util.h"
#include "stat-util.h"
#include "string-util.h"
#include "strv.h"
#include "xattr-util.h"
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
#define DEFAULT_COMPRESS_THRESHOLD (512ULL)
#define MIN_COMPRESS_THRESHOLD (8ULL)
/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
/* These are the lower and upper bounds if we deduce the max_use value
* from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
/* This is the upper bound if we deduce the keep_free value from the
* file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
/* This is the keep_free value when we can't determine the system
* size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)
/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20
/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
/* The mmap context to use for the header we pick as one above the last defined type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
#ifdef __clang__
# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
* As a result we use atomic operations on f->offline_state for inter-thread communications with
* journal_file_set_offline() and journal_file_set_online(). */
static void journal_file_set_offline_internal(JournalFile *f) {
assert(f);
assert(f->fd >= 0);
assert(f->header);
for (;;) {
switch (f->offline_state) {
case OFFLINE_CANCEL:
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
continue;
return;
case OFFLINE_AGAIN_FROM_SYNCING:
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
continue;
break;
case OFFLINE_AGAIN_FROM_OFFLINING:
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
continue;
break;
case OFFLINE_SYNCING:
(void) fsync(f->fd);
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
continue;
f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
(void) fsync(f->fd);
break;
case OFFLINE_OFFLINING:
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
continue;
_fallthrough_;
case OFFLINE_DONE:
return;
case OFFLINE_JOINED:
log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
return;
}
}
}
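/* For orientation, a summary of the offline state machine driven here and in
 * journal_file_set_offline()/journal_file_set_online() (derived from the transitions
 * in this file; illustrative, not authoritative documentation):
 *
 *   JOINED --set_offline()--> SYNCING --fsync()--> OFFLINING --> DONE --join--> JOINED
 *   SYNCING|OFFLINING --try_restart()--> AGAIN_FROM_{SYNCING,OFFLINING} --> SYNCING
 *   SYNCING|AGAIN_FROM_* --set_online()--> CANCEL --> DONE
 */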
static void * journal_file_set_offline_thread(void *arg) {
JournalFile *f = arg;
(void) pthread_setname_np(pthread_self(), "journal-offline");
journal_file_set_offline_internal(f);
return NULL;
}
static int journal_file_set_offline_thread_join(JournalFile *f) {
int r;
assert(f);
if (f->offline_state == OFFLINE_JOINED)
return 0;
r = pthread_join(f->offline_thread, NULL);
if (r)
return -r;
f->offline_state = OFFLINE_JOINED;
if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
return -EIO;
return 0;
}
/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
static bool journal_file_set_offline_try_restart(JournalFile *f) {
for (;;) {
switch (f->offline_state) {
case OFFLINE_AGAIN_FROM_SYNCING:
case OFFLINE_AGAIN_FROM_OFFLINING:
return true;
case OFFLINE_CANCEL:
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
continue;
return true;
case OFFLINE_SYNCING:
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
continue;
return true;
case OFFLINE_OFFLINING:
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
continue;
return true;
default:
return false;
}
}
}
/* Sets a journal offline.
*
* If wait is false then an offline is dispatched in a separate thread for a
* subsequent journal_file_set_offline() or journal_file_set_online() of the
* same journal to synchronize with.
*
* If wait is true, then either an existing offline thread will be restarted
* and joined, or if none exists the offline is simply performed in this
* context without involving another thread.
*/
int journal_file_set_offline(JournalFile *f, bool wait) {
bool restarted;
int r;
assert(f);
if (!f->writable)
return -EPERM;
if (f->fd < 0 || !f->header)
return -EINVAL;
/* An offlining journal is implicitly online and may modify f->header->state;
* we must also join any potentially lingering offline thread when not online. */
if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
return journal_file_set_offline_thread_join(f);
/* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
restarted = journal_file_set_offline_try_restart(f);
if ((restarted && wait) || !restarted) {
r = journal_file_set_offline_thread_join(f);
if (r < 0)
return r;
}
if (restarted)
return 0;
/* Initiate a new offline. */
f->offline_state = OFFLINE_SYNCING;
if (wait) /* Without using a thread if waiting. */
journal_file_set_offline_internal(f);
else {
sigset_t ss, saved_ss;
int k;
assert_se(sigfillset(&ss) >= 0);
/* Don't block SIGBUS since the offlining thread accesses a memory mapped file.
* Asynchronous SIGBUS signals can safely be handled by either thread. */
assert_se(sigdelset(&ss, SIGBUS) >= 0);
r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
if (r > 0)
return -r;
r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
if (r > 0) {
f->offline_state = OFFLINE_JOINED;
return -r;
}
if (k > 0)
return -k;
}
return 0;
}
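/* Illustrative sketch (hypothetical caller, not part of the original file): how a
 * writer might use the two modes of journal_file_set_offline(); error handling is
 * abbreviated.
 *
 *     r = journal_file_set_offline(f, false);  // dispatch sync + offline to a thread
 *     if (r < 0)
 *             return r;
 *
 *     // ... other work; a later journal_file_set_online() or a waiting
 *     // set_offline() synchronizes with the thread ...
 *
 *     r = journal_file_set_offline(f, true);   // restart/join; fully offline on return
 */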
static int journal_file_set_online(JournalFile *f) {
bool wait = true;
assert(f);
if (!f->writable)
return -EPERM;
if (f->fd < 0 || !f->header)
return -EINVAL;
while (wait) {
switch (f->offline_state) {
case OFFLINE_JOINED:
/* No offline thread, no need to wait. */
wait = false;
break;
case OFFLINE_SYNCING:
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
continue;
/* Canceled syncing prior to offlining, no need to wait. */
wait = false;
break;
case OFFLINE_AGAIN_FROM_SYNCING:
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
continue;
/* Canceled restart from syncing, no need to wait. */
wait = false;
break;
case OFFLINE_AGAIN_FROM_OFFLINING:
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
continue;
/* Canceled restart from offlining, must wait for offlining to complete however. */
_fallthrough_;
default: {
int r;
r = journal_file_set_offline_thread_join(f);
if (r < 0)
return r;
wait = false;
break;
}
}
}
if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
return -EIO;
switch (f->header->state) {
case STATE_ONLINE:
return 0;
case STATE_OFFLINE:
f->header->state = STATE_ONLINE;
(void) fsync(f->fd);
return 0;
default:
return -EINVAL;
}
}
bool journal_file_is_offlining(JournalFile *f) {
assert(f);
__sync_synchronize();
if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
return false;
return true;
}
JournalFile* journal_file_close(JournalFile *f) {
assert(f);
#if HAVE_GCRYPT
/* Write the final tag */
if (f->seal && f->writable) {
int r;
r = journal_file_append_tag(f);
if (r < 0)
log_error_errno(r, "Failed to append tag when closing journal: %m");
}
#endif
/* ftruncate() notifications may be coalesced into ~250ms windows via
* journal_file_enable_post_change_timer(), instead of one ftruncate() per
* append; flush any pending one before tearing the timer down. */
if (f->post_change_timer) {
if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0)
journal_file_post_change(f);
sd_event_source_disable_unref(f->post_change_timer);
}
journal_file_set_offline(f, true);
if (f->mmap && f->cache_fd)
mmap_cache_free_fd(f->mmap, f->cache_fd);
if (f->fd >= 0 && f->defrag_on_close) {
/* Be friendly to btrfs: turn COW back on again now,
* and defragment the file. We won't write to the file
* ever again, hence remove all fragmentation, and
* reenable all the good bits COW usually provides
* (such as data checksumming). */
(void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
(void) btrfs_defrag_fd(f->fd);
}
if (f->close_fd)
safe_close(f->fd);
free(f->path);
mmap_cache_unref(f->mmap);
ordered_hashmap_free_free(f->chain_cache);
#if HAVE_XZ || HAVE_LZ4
free(f->compress_buffer);
#endif
#if HAVE_GCRYPT
if (f->fss_file)
munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
else
free(f->fsprg_state);
free(f->fsprg_seed);
if (f->hmac)
gcry_md_close(f->hmac);
#endif
return mfree(f);
}
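/* Since journal_file_close() always returns mfree(f), i.e. NULL, the idiomatic
 * call site resets the pointer in one step:
 *
 *     f = journal_file_close(f);
 */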
static int journal_file_init_header(JournalFile *f, JournalFile *template) {
Header h = {};
ssize_t k;
int r;
assert(f);
memcpy(h.signature, HEADER_SIGNATURE, 8);
h.header_size = htole64(ALIGN64(sizeof(h)));
h.incompatible_flags |= htole32(
f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
h.compatible_flags = htole32(
f->seal * HEADER_COMPATIBLE_SEALED);
r = sd_id128_randomize(&h.file_id);
if (r < 0)
return r;
if (template) {
h.seqnum_id = template->header->seqnum_id;
h.tail_entry_seqnum = template->header->tail_entry_seqnum;
} else
h.seqnum_id = h.file_id;
k = pwrite(f->fd, &h, sizeof(h), 0);
if (k < 0)
return -errno;
if (k != sizeof(h))
return -EIO;
return 0;
}
static int journal_file_refresh_header(JournalFile *f) {
sd_id128_t boot_id;
int r;
assert(f);
assert(f->header);
r = sd_id128_get_machine(&f->header->machine_id);
if (IN_SET(r, -ENOENT, -ENOMEDIUM))
/* We don't have a machine-id, let's continue without */
zero(f->header->machine_id);
else if (r < 0)
return r;
r = sd_id128_get_boot(&boot_id);
if (r < 0)
return r;
f->header->boot_id = boot_id;
r = journal_file_set_online(f);
/* Sync the online state to disk */
(void) fsync(f->fd);
/* We likely just created a new file, also sync the directory this file is located in. */
(void) fsync_directory_of_file(f->fd);
return r;
}
static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
const char *type = compatible ? "compatible" : "incompatible";
uint32_t flags;
flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
if (flags & ~supported) {
if (flags & ~any)
log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
f->path, type, flags & ~any);
flags = (flags & any) & ~supported;
if (flags) {
const char* strv[3];
unsigned n = 0;
_cleanup_free_ char *t = NULL;
if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
strv[n++] = "sealed";
if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
strv[n++] = "xz-compressed";
if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
strv[n++] = "lz4-compressed";
strv[n] = NULL;
assert(n < ELEMENTSOF(strv));
t = strv_join((char**) strv, ", ");
log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
f->path, type, n > 1 ? "flags" : "flag", strnull(t));
}
return true;
}
return false;
}
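/* Worked example of the masking above, assuming a build without lz4 support, so
 * that HEADER_INCOMPATIBLE_COMPRESSED_LZ4 is in the _ANY mask but not in
 * _SUPPORTED: a file carrying that flag has (flags & ~supported) != 0, so we
 * return true (the caller refuses the file), and since the flag is within 'any'
 * it is reported by name as "lz4-compressed". A flag outside 'any' would instead
 * be logged as an unknown flag. */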
static int journal_file_verify_header(JournalFile *f) {
uint64_t arena_size, header_size;
assert(f);
assert(f->header);
if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
return -EBADMSG;
/* In both read and write mode we refuse to open files with incompatible
* flags we don't know. */
if (warn_wrong_flags(f, false))
return -EPROTONOSUPPORT;
/* When opened for writing we refuse to open files with unsupported compatible flags, too. */
if (f->writable && warn_wrong_flags(f, true))
return -EPROTONOSUPPORT;
if (f->header->state >= _STATE_MAX)
return -EBADMSG;
header_size = le64toh(f->header->header_size);
/* The first addition was n_data, so check that we are at least this large */
if (header_size < HEADER_SIZE_MIN)
return -EBADMSG;
if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
return -EBADMSG;
arena_size = le64toh(f->header->arena_size);
if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
return -ENODATA;
if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
return -ENODATA;
if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
!VALID64(le64toh(f->header->field_hash_table_offset)) ||
!VALID64(le64toh(f->header->tail_object_offset)) ||
!VALID64(le64toh(f->header->entry_array_offset)))
return -ENODATA;
if (f->writable) {
sd_id128_t machine_id;
uint8_t state;
int r;
r = sd_id128_get_machine(&machine_id);
if (r < 0)
return r;
if (!sd_id128_equal(machine_id, f->header->machine_id))
return -EHOSTDOWN;
state = f->header->state;
if (state == STATE_ARCHIVED)
return -ESHUTDOWN; /* Already archived */
else if (state == STATE_ONLINE)
return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
"Journal file %s is already online. Assuming unclean closing.",
f->path);
else if (state != STATE_OFFLINE)
return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
"Journal file %s has unknown state %i.",
f->path, state);
if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
return -EBADMSG;
/* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
* be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
* bisection. */
if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
"Journal file %s is from the future, refusing to append new data to it that'd be older.",
f->path);
}
f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
f->seal = JOURNAL_HEADER_SEALED(f->header);
return 0;
}
static int journal_file_fstat(JournalFile *f) {
int r;
assert(f);
assert(f->fd >= 0);
if (fstat(f->fd, &f->last_stat) < 0)
return -errno;
f->last_stat_usec = now(CLOCK_MONOTONIC);
/* Refuse dealing with files that aren't regular */
r = stat_verify_regular(&f->last_stat);
if (r < 0)
return r;
/* Refuse appending to files that are already deleted */
if (f->last_stat.st_nlink <= 0)
return -EIDRM;
return 0;
}
static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
uint64_t old_size, new_size;
int r;
assert(f);
assert(f->header);
/* We assume that this file is not sparse, and we know that
* for sure, since we always call posix_fallocate()
* ourselves */
if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
return -EIO;
old_size =
le64toh(f->header->header_size) +
le64toh(f->header->arena_size);
new_size = PAGE_ALIGN(offset + size);
if (new_size < le64toh(f->header->header_size))
new_size = le64toh(f->header->header_size);
if (new_size <= old_size) {
/* We already pre-allocated enough space, but before
* we write to it, let's check with fstat() if the
* file got deleted, in order to make sure we don't throw
* away the data immediately. Don't check fstat() for
* all writes though, but at most once every 5s
* (LAST_STAT_REFRESH_USEC). */
if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
return 0;
return journal_file_fstat(f);
}
/* Allocate more space. */
if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
return -E2BIG;
if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
struct statvfs svfs;
if (fstatvfs(f->fd, &svfs) >= 0) {
uint64_t available;
available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
if (new_size - old_size > available)
return -E2BIG;
}
}
/* Increase by larger blocks at once */
new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
new_size = f->metrics.max_size;
/* Note that the glibc fallocate() fallback is very
inefficient, hence we try to keep the allocation area
as small as we can. */
r = posix_fallocate(f->fd, old_size, new_size - old_size);
if (r != 0)
return -r;
f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
return journal_file_fstat(f);
}
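/* Worked example of the growth arithmetic above, with the 8 MiB FILE_SIZE_INCREASE:
 * a write that makes PAGE_ALIGN(offset + size) equal to, say, 9 MiB is rounded up
 * via DIV_ROUND_UP(9 MiB, 8 MiB) * 8 MiB = 16 MiB, so the file grows in large
 * steps and posix_fallocate() runs far less often than once per append. */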
static unsigned type_to_context(ObjectType type) {
/* One context for each type, plus one catch-all for the rest */
assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
}
static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
int r;
assert(f);
assert(ret);
if (size <= 0)
return -EINVAL;
/* Avoid SIGBUS on invalid accesses */
if (offset + size > (uint64_t) f->last_stat.st_size) {
/* Hmm, out of range? Let's refresh the fstat() data
* first, before we trust that check. */
r = journal_file_fstat(f);
if (r < 0)
return r;
if (offset + size > (uint64_t) f->last_stat.st_size)
return -EADDRNOTAVAIL;
}
return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
}
static uint64_t minimum_header_size(Object *o) {
static const uint64_t table[] = {
[OBJECT_DATA] = sizeof(DataObject),
[OBJECT_FIELD] = sizeof(FieldObject),
[OBJECT_ENTRY] = sizeof(EntryObject),
[OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
[OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
[OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
[OBJECT_TAG] = sizeof(TagObject),
};
if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
return sizeof(ObjectHeader);
return table[o->object.type];
}
/* Lightweight object checks, essentially a cheap subset of
 * journal_file_object_verify(): the most expensive checks are omitted so that we
 * don't slow down every journal_file_move_to_object() call too much, while still
 * catching partially written or otherwise corrupted objects. */
static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
assert(f);
assert(o);
switch (o->object.type) {
case OBJECT_DATA: {
if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Bad n_entries: %" PRIu64 ": %" PRIu64,
le64toh(o->data.n_entries),
offset);
if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
offsetof(DataObject, payload),
le64toh(o->object.size),
offset);
if (!VALID64(le64toh(o->data.next_hash_offset)) ||
!VALID64(le64toh(o->data.next_field_offset)) ||
!VALID64(le64toh(o->data.entry_offset)) ||
!VALID64(le64toh(o->data.entry_array_offset)))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
le64toh(o->data.next_hash_offset),
le64toh(o->data.next_field_offset),
le64toh(o->data.entry_offset),
le64toh(o->data.entry_array_offset),
offset);
break;
}
case OBJECT_FIELD:
if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
offsetof(FieldObject, payload),
le64toh(o->object.size),
offset);
if (!VALID64(le64toh(o->field.next_hash_offset)) ||
!VALID64(le64toh(o->field.head_data_offset)))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
le64toh(o->field.next_hash_offset),
le64toh(o->field.head_data_offset),
offset);
break;
case OBJECT_ENTRY:
if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
offsetof(EntryObject, items),
le64toh(o->object.size),
offset);
if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid number items in entry: %" PRIu64 ": %" PRIu64,
(le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
offset);
if (le64toh(o->entry.seqnum) <= 0)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
le64toh(o->entry.seqnum),
offset);
if (!VALID_REALTIME(le64toh(o->entry.realtime)))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
le64toh(o->entry.realtime),
offset);
if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
le64toh(o->entry.monotonic),
offset);
break;
case OBJECT_DATA_HASH_TABLE:
case OBJECT_FIELD_HASH_TABLE:
if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
(le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
le64toh(o->object.size),
offset);
break;
case OBJECT_ENTRY_ARRAY:
if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
(le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid object entry array size: %" PRIu64 ": %" PRIu64,
le64toh(o->object.size),
offset);
if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
le64toh(o->entry_array.next_entry_array_offset),
offset);
break;
case OBJECT_TAG:
if (le64toh(o->object.size) != sizeof(TagObject))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid object tag size: %" PRIu64 ": %" PRIu64,
le64toh(o->object.size),
offset);
if (!VALID_EPOCH(le64toh(o->tag.epoch)))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
le64toh(o->tag.epoch), offset);
break;
}
return 0;
}
int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
int r;
void *t;
size_t tsize;
Object *o;
uint64_t s;
assert(f);
assert(ret);
/* Objects may only be located at multiples of 64 bits */
if (!VALID64(offset))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Attempt to move to object at non-64bit boundary: %" PRIu64,
offset);
/* Object may not be located in the file header */
if (offset < le64toh(f->header->header_size))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Attempt to move to object located in file header: %" PRIu64,
offset);
r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
if (r < 0)
return r;
o = (Object*) t;
s = le64toh(o->object.size);
if (s == 0)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Attempt to move to uninitialized object: %" PRIu64,
offset);
if (s < sizeof(ObjectHeader))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Attempt to move to overly short object: %" PRIu64,
offset);
if (o->object.type <= OBJECT_UNUSED)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Attempt to move to object with invalid type: %" PRIu64,
offset);
if (s < minimum_header_size(o))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Attempt to move to truncated object: %" PRIu64,
offset);
if (type > OBJECT_UNUSED && o->object.type != type)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Attempt to move to object of unexpected type: %" PRIu64,
offset);
if (s > tsize) {
r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
if (r < 0)
return r;
o = (Object*) t;
}
r = journal_file_check_object(f, offset, o);
if (r < 0)
return r;
*ret = o;
return 0;
}
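/* Returns the next entry sequence number: one past the file's current tail seqnum, or one
 * past the external counter, whichever is larger. If an external counter is passed in, it
 * is advanced too, so that seqnums stay monotonic across all files sharing that counter. */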
static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
uint64_t r;
assert(f);
assert(f->header);
r = le64toh(f->header->tail_entry_seqnum) + 1;
if (seqnum) {
/* If an external seqnum counter was passed, we update
* both the local and the external one, and set it to
* the maximum of both */
if (*seqnum + 1 > r)
r = *seqnum + 1;
*seqnum = r;
}
f->header->tail_entry_seqnum = htole64(r);
if (f->header->head_entry_seqnum == 0)
f->header->head_entry_seqnum = htole64(r);
return r;
}
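/* Appends a fresh object of the given type and size at the end of the file: the offset is
 * derived from tail_object_offset (rounded up to an 8 byte boundary), the file is grown as
 * needed, the object is mapped, zeroed and its header filled in, and the file header's
 * tail_object_offset/n_objects fields are updated.
 *
 * Usage sketch (illustrative; error handling elided):
 *
 *     Object *o;
 *     uint64_t p;
 *     r = journal_file_append_object(f, OBJECT_DATA,
 *                                    offsetof(Object, data.payload) + size, &o, &p);
 */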
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
int r;
uint64_t p;
Object *tail, *o;
void *t;
assert(f);
assert(f->header);
assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
assert(size >= sizeof(ObjectHeader));
assert(offset);
assert(ret);
r = journal_file_set_online(f);
if (r < 0)
return r;
p = le64toh(f->header->tail_object_offset);
if (p == 0)
p = le64toh(f->header->header_size);
else {
r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
if (r < 0)
return r;
p += ALIGN64(le64toh(tail->object.size));
}
r = journal_file_allocate(f, p, size);
if (r < 0)
return r;
r = journal_file_move_to(f, type, false, p, size, &t, NULL);
if (r < 0)
return r;
o = (Object*) t;
zero(o->object);
o->object.type = type;
o->object.size = htole64(size);
f->header->tail_object_offset = htole64(p);
f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
*ret = o;
*offset = p;
return 0;
}
static int journal_file_setup_data_hash_table(JournalFile *f) {
uint64_t s, p;
Object *o;
int r;
assert(f);
assert(f->header);
/* We estimate that we need 1 hash table entry per 768 bytes
of journal file and we want to make sure we never get
beyond 75% fill level. Calculate the hash table size for
the maximum file size based on these metrics. */
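/* I.e. number of slots = (max_size / 768) * 4/3, so the table stays at or below 75% fill
 * once one data object per 768 bytes has been inserted. Illustrative example, assuming
 * max_size = 128 MiB and 16-byte HashItems: roughly 233k slots, i.e. a ~3.6 MiB table. */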
s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
s = DEFAULT_DATA_HASH_TABLE_SIZE;
log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
r = journal_file_append_object(f,
OBJECT_DATA_HASH_TABLE,
offsetof(Object, hash_table.items) + s,
&o, &p);
if (r < 0)
return r;
memzero(o->hash_table.items, s);
f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
f->header->data_hash_table_size = htole64(s);
return 0;
}
static int journal_file_setup_field_hash_table(JournalFile *f) {
uint64_t s, p;
Object *o;
int r;
assert(f);
assert(f->header);
/* We use a fixed size hash table for the fields as the
 * number of distinct fields should only grow very slowly */
s = DEFAULT_FIELD_HASH_TABLE_SIZE;
r = journal_file_append_object(f,
OBJECT_FIELD_HASH_TABLE,
offsetof(Object, hash_table.items) + s,
&o, &p);
if (r < 0)
return r;
memzero(o->hash_table.items, s);
f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
f->header->field_hash_table_size = htole64(s);
return 0;
}
int journal_file_map_data_hash_table(JournalFile *f) {
uint64_t s, p;
void *t;
int r;
assert(f);
assert(f->header);
if (f->data_hash_table)
return 0;
p = le64toh(f->header->data_hash_table_offset);
s = le64toh(f->header->data_hash_table_size);
r = journal_file_move_to(f,
OBJECT_DATA_HASH_TABLE,
true,
p, s,
&t, NULL);
if (r < 0)
return r;
f->data_hash_table = t;
return 0;
}
int journal_file_map_field_hash_table(JournalFile *f) {
uint64_t s, p;
void *t;
int r;
assert(f);
assert(f->header);
if (f->field_hash_table)
return 0;
p = le64toh(f->header->field_hash_table_offset);
s = le64toh(f->header->field_hash_table_size);
r = journal_file_move_to(f,
OBJECT_FIELD_HASH_TABLE,
true,
p, s,
&t, NULL);
if (r < 0)
return r;
f->field_hash_table = t;
return 0;
}
static int journal_file_link_field(
JournalFile *f,
Object *o,
uint64_t offset,
uint64_t hash) {
uint64_t p, h, m;
int r;
assert(f);
assert(f->header);
assert(f->field_hash_table);
assert(o);
assert(offset > 0);
if (o->object.type != OBJECT_FIELD)
return -EINVAL;
m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
if (m <= 0)
return -EBADMSG;
/* This might alter the window we are looking at */
o->field.next_hash_offset = o->field.head_data_offset = 0;
h = hash % m;
p = le64toh(f->field_hash_table[h].tail_hash_offset);
if (p == 0)
f->field_hash_table[h].head_hash_offset = htole64(offset);
else {
r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
if (r < 0)
return r;
o->field.next_hash_offset = htole64(offset);
}
f->field_hash_table[h].tail_hash_offset = htole64(offset);
if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
return 0;
}
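/* Data objects are linked into the data hash table with the very same chaining scheme:
 * each bucket keeps head and tail offsets, and the objects within a bucket are chained
 * through their next_hash_offset fields, so linking only requires patching the current
 * tail (or the bucket head, if the bucket was empty). */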
static int journal_file_link_data(
JournalFile *f,
Object *o,
uint64_t offset,
uint64_t hash) {
uint64_t p, h, m;
int r;
assert(f);
assert(f->header);
assert(f->data_hash_table);
assert(o);
assert(offset > 0);
if (o->object.type != OBJECT_DATA)
return -EINVAL;
m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
if (m <= 0)
return -EBADMSG;
/* This might alter the window we are looking at */
o->data.next_hash_offset = o->data.next_field_offset = 0;
o->data.entry_offset = o->data.entry_array_offset = 0;
o->data.n_entries = 0;
h = hash % m;
p = le64toh(f->data_hash_table[h].tail_hash_offset);
if (p == 0)
/* Only entry in the hash table is easy */
f->data_hash_table[h].head_hash_offset = htole64(offset);
else {
/* Move back to the previous data object, to patch in
 * the pointer */
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
o->data.next_hash_offset = htole64(offset);
}
f->data_hash_table[h].tail_hash_offset = htole64(offset);
if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
return 0;
}
int journal_file_find_field_object_with_hash(
JournalFile *f,
const void *field, uint64_t size, uint64_t hash,
Object **ret, uint64_t *offset) {
uint64_t p, osize, h, m;
int r;
assert(f);
assert(f->header);
assert(field && size > 0);
/* If the field hash table is empty, we can't find anything */
if (le64toh(f->header->field_hash_table_size) <= 0)
return 0;
/* Map the field hash table, if it isn't mapped yet. */
r = journal_file_map_field_hash_table(f);
if (r < 0)
return r;
osize = offsetof(Object, field.payload) + size;
m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
if (m <= 0)
return -EBADMSG;
h = hash % m;
p = le64toh(f->field_hash_table[h].head_hash_offset);
while (p > 0) {
Object *o;
r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
if (r < 0)
return r;
if (le64toh(o->field.hash) == hash &&
le64toh(o->object.size) == osize &&
memcmp(o->field.payload, field, size) == 0) {
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
p = le64toh(o->field.next_hash_offset);
}
return 0;
}
int journal_file_find_field_object(
JournalFile *f,
const void *field, uint64_t size,
Object **ret, uint64_t *offset) {
uint64_t hash;
assert(f);
assert(field && size > 0);
hash = hash64(field, size);
return journal_file_find_field_object_with_hash(f,
field, size, hash,
ret, offset);
}
int journal_file_find_data_object_with_hash(
JournalFile *f,
const void *data, uint64_t size, uint64_t hash,
Object **ret, uint64_t *offset) {
uint64_t p, osize, h, m;
int r;
assert(f);
assert(f->header);
assert(data || size == 0);
/* If there's no data hash table, then there's no entry. */
if (le64toh(f->header->data_hash_table_size) <= 0)
return 0;
/* Map the data hash table, if it isn't mapped yet. */
r = journal_file_map_data_hash_table(f);
if (r < 0)
return r;
osize = offsetof(Object, data.payload) + size;
m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
if (m <= 0)
return -EBADMSG;
h = hash % m;
p = le64toh(f->data_hash_table[h].head_hash_offset);
while (p > 0) {
Object *o;
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
if (le64toh(o->data.hash) != hash)
goto next;
if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_XZ || HAVE_LZ4
uint64_t l;
size_t rsize = 0;
l = le64toh(o->object.size);
if (l <= offsetof(Object, data.payload))
return -EBADMSG;
l -= offsetof(Object, data.payload);
r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
if (r < 0)
return r;
if (rsize == size &&
memcmp(f->compress_buffer, data, size) == 0) {
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
#else
return -EPROTONOSUPPORT;
#endif
} else if (le64toh(o->object.size) == osize &&
memcmp(o->data.payload, data, size) == 0) {
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
next:
p = le64toh(o->data.next_hash_offset);
}
return 0;
}
int journal_file_find_data_object(
JournalFile *f,
const void *data, uint64_t size,
Object **ret, uint64_t *offset) {
uint64_t hash;
assert(f);
assert(data || size == 0);
hash = hash64(data, size);
return journal_file_find_data_object_with_hash(f,
data, size, hash,
ret, offset);
}
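/* Usage sketch (illustrative; error handling elided): look up a data object by payload.
 *
 *     Object *o;
 *     uint64_t off;
 *     r = journal_file_find_data_object(f, "MESSAGE=foo", 11, &o, &off);
 *     if (r > 0)
 *             ... found, 'o'/'off' point to the existing object ...
 *     else if (r == 0)
 *             ... no such data object in this file ...
 */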
static int journal_file_append_field(
JournalFile *f,
const void *field, uint64_t size,
Object **ret, uint64_t *offset) {
uint64_t hash, p;
uint64_t osize;
Object *o;
int r;
assert(f);
assert(field && size > 0);
hash = hash64(field, size);
r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
if (r < 0)
return r;
else if (r > 0) {
if (ret)
*ret = o;
if (offset)
*offset = p;
return 0;
}
osize = offsetof(Object, field.payload) + size;
r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
if (r < 0)
return r;
o->field.hash = htole64(hash);
memcpy(o->field.payload, field, size);
r = journal_file_link_field(f, o, p, hash);
if (r < 0)
return r;
/* The linking might have altered the window, so let's
* refresh our pointer */
r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
if (r < 0)
return r;
#if HAVE_GCRYPT
r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
if (r < 0)
return r;
#endif
if (ret)
*ret = o;
if (offset)
*offset = p;
return 0;
}
static int journal_file_append_data(
JournalFile *f,
const void *data, uint64_t size,
Object **ret, uint64_t *offset) {
uint64_t hash, p;
uint64_t osize;
Object *o;
int r, compression = 0;
const void *eq;
assert(f);
assert(data || size == 0);
hash = hash64(data, size);
r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
if (r < 0)
return r;
if (r > 0) {
if (ret)
*ret = o;
if (offset)
*offset = p;
return 0;
}
osize = offsetof(Object, data.payload) + size;
r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
if (r < 0)
return r;
o->data.hash = htole64(hash);
#if HAVE_XZ || HAVE_LZ4
if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
size_t rsize = 0;
compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
if (compression >= 0) {
o->object.size = htole64(offsetof(Object, data.payload) + rsize);
o->object.flags |= compression;
log_debug("Compressed data object %"PRIu64" -> %zu using %s",
size, rsize, object_compressed_to_string(compression));
} else
/* Compression didn't work, we don't really care why, let's continue without compression */
compression = 0;
}
#endif
if (compression == 0)
memcpy_safe(o->data.payload, data, size);
r = journal_file_link_data(f, o, p, hash);
if (r < 0)
return r;
#if HAVE_GCRYPT
r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
if (r < 0)
return r;
#endif
/* The linking might have altered the window, so let's
* refresh our pointer */
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
if (!data)
eq = NULL;
else
eq = memchr(data, '=', size);
if (eq && eq > data) {
Object *fo = NULL;
uint64_t fp;
/* Create field object ... */
r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
if (r < 0)
return r;
/* ... and link it in. */
o->data.next_field_offset = fo->field.head_data_offset;
fo->field.head_data_offset = htole64(p);
}
if (ret)
*ret = o;
if (offset)
*offset = p;
return 0;
}
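/* The helpers below compute how many items an object carries. All three object types store
 * their items as a trailing flexible array, hence the count is simply
 * (object size - offset of the array) / (size of one item). */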
uint64_t journal_file_entry_n_items(Object *o) {
assert(o);
if (o->object.type != OBJECT_ENTRY)
return 0;
return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
}
uint64_t journal_file_entry_array_n_items(Object *o) {
assert(o);
if (o->object.type != OBJECT_ENTRY_ARRAY)
return 0;
return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
}
uint64_t journal_file_hash_table_n_items(Object *o) {
assert(o);
if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
return 0;
return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
}
static int link_entry_into_array(JournalFile *f,
le64_t *first,
le64_t *idx,
uint64_t p) {
int r;
uint64_t n = 0, ap = 0, q, i, a, hidx;
Object *o;
assert(f);
assert(f->header);
assert(first);
assert(idx);
assert(p > 0);
a = le64toh(*first);
i = hidx = le64toh(*idx);
while (a > 0) {
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
if (r < 0)
return r;
n = journal_file_entry_array_n_items(o);
if (i < n) {
o->entry_array.items[i] = htole64(p);
*idx = htole64(hidx + 1);
return 0;
}
i -= n;
ap = a;
a = le64toh(o->entry_array.next_entry_array_offset);
}
if (hidx > n)
n = (hidx+1) * 2;
else
n = n * 2;
if (n < 4)
n = 4;
r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
&o, &q);
if (r < 0)
return r;
#if HAVE_GCRYPT
r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
if (r < 0)
return r;
#endif
o->entry_array.items[i] = htole64(p);
if (ap == 0)
*first = htole64(q);
else {
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
if (r < 0)
return r;
o->entry_array.next_entry_array_offset = htole64(q);
}
if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
*idx = htole64(hidx + 1);
return 0;
}
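/* Same as link_entry_into_array(), except that the very first entry offset is stored
 * inline in 'extra' instead of the array chain. Data objects use this to avoid allocating
 * an entry array for data referenced by a single entry only: data.entry_offset is the
 * inline slot, data.entry_array_offset the chain for all subsequent entries. */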
static int link_entry_into_array_plus_one(JournalFile *f,
le64_t *extra,
le64_t *first,
le64_t *idx,
uint64_t p) {
int r;
assert(f);
assert(extra);
assert(first);
assert(idx);
assert(p > 0);
if (*idx == 0)
*extra = htole64(p);
else {
le64_t i;
i = htole64(le64toh(*idx) - 1);
r = link_entry_into_array(f, first, &i, p);
if (r < 0)
return r;
}
*idx = htole64(le64toh(*idx) + 1);
return 0;
}
static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
uint64_t p;
int r;
assert(f);
assert(o);
assert(offset > 0);
p = le64toh(o->entry.items[i].object_offset);
if (p == 0)
return -EINVAL;
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
return link_entry_into_array_plus_one(f,
&o->data.entry_offset,
&o->data.entry_array_offset,
&o->data.n_entries,
offset);
}
static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
uint64_t n, i;
int r;
assert(f);
assert(f->header);
assert(o);
assert(offset > 0);
if (o->object.type != OBJECT_ENTRY)
return -EINVAL;
__sync_synchronize();
/* Link up the entry itself */
r = link_entry_into_array(f,
&f->header->entry_array_offset,
&f->header->n_entries,
offset);
if (r < 0)
return r;
/* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
if (f->header->head_entry_realtime == 0)
f->header->head_entry_realtime = o->entry.realtime;
f->header->tail_entry_realtime = o->entry.realtime;
f->header->tail_entry_monotonic = o->entry.monotonic;
/* Link up the items */
n = journal_file_entry_n_items(o);
for (i = 0; i < n; i++) {
r = journal_file_link_entry_item(f, o, offset, i);
if (r < 0)
return r;
}
return 0;
}
static int journal_file_append_entry_internal(
JournalFile *f,
const dual_timestamp *ts,
const sd_id128_t *boot_id,
uint64_t xor_hash,
const EntryItem items[], unsigned n_items,
uint64_t *seqnum,
Object **ret, uint64_t *offset) {
uint64_t np;
uint64_t osize;
Object *o;
int r;
assert(f);
assert(f->header);
assert(items || n_items == 0);
assert(ts);
osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
if (r < 0)
return r;
o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
o->entry.realtime = htole64(ts->realtime);
o->entry.monotonic = htole64(ts->monotonic);
o->entry.xor_hash = htole64(xor_hash);
if (boot_id)
f->header->boot_id = *boot_id;
o->entry.boot_id = f->header->boot_id;
#if HAVE_GCRYPT
r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
if (r < 0)
return r;
#endif
r = journal_file_link_entry(f, o, np);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = np;
return 0;
}
void journal_file_post_change(JournalFile *f) {
assert(f);
if (f->fd < 0)
return;
/* inotify() does not receive IN_MODIFY events from file
 * accesses done via mmap(). After each access we hence
 * trigger IN_MODIFY by truncating the journal file to its
 * current size. */
__sync_synchronize();
if (ftruncate(f->fd, f->last_stat.st_size) < 0)
log_debug_errno(errno, "Failed to truncate file to its own size: %m");
}
static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
assert(userdata);
journal_file_post_change(userdata);
return 1;
}
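/* Defers the ftruncate()-based change notification: instead of truncating on every append,
 * a oneshot timer is armed, so that all journal writes within one post_change_timer_period
 * are coalesced into a single journal_file_post_change() invocation. */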
static void schedule_post_change(JournalFile *f) {
uint64_t now;
int r;
assert(f);
assert(f->post_change_timer);
r = sd_event_source_get_enabled(f->post_change_timer, NULL);
if (r < 0) {
log_debug_errno(r, "Failed to get ftruncate timer state: %m");
goto fail;
}
if (r > 0)
return;
r = sd_event_now(sd_event_source_get_event(f->post_change_timer), CLOCK_MONOTONIC, &now);
if (r < 0) {
log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
goto fail;
}
r = sd_event_source_set_time(f->post_change_timer, now + f->post_change_timer_period);
if (r < 0) {
log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
goto fail;
}
r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
if (r < 0) {
log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
goto fail;
}
return;
fail:
/* On failure, let's simply post the change immediately. */
journal_file_post_change(f);
}
/* Enable coalesced change posting in a timer on the provided sd_event instance */
int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
_cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
int r;
assert(f);
assert_return(!f->post_change_timer, -EINVAL);
assert(e);
assert(t);
r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
if (r < 0)
return r;
r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
if (r < 0)
return r;
f->post_change_timer = TAKE_PTR(timer);
f->post_change_timer_period = t;
return r;
}
static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
}
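/* Usage sketch (illustrative; error handling elided): append one entry with two fields,
 * letting the function pick the current timestamps and the file's boot ID.
 *
 *     struct iovec iov[] = {
 *             { (char*) "MESSAGE=hello", 13 },
 *             { (char*) "PRIORITY=6", 10 },
 *     };
 *     r = journal_file_append_entry(f, NULL, NULL, iov, 2, NULL, NULL, NULL);
 */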
int journal_file_append_entry(
JournalFile *f,
const dual_timestamp *ts,
const sd_id128_t *boot_id,
const struct iovec iovec[], unsigned n_iovec,
uint64_t *seqnum,
Object **ret, uint64_t *offset) {
unsigned i;
EntryItem *items;
int r;
uint64_t xor_hash = 0;
struct dual_timestamp _ts;
assert(f);
assert(f->header);
assert(iovec || n_iovec == 0);
if (ts) {
if (!VALID_REALTIME(ts->realtime))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid realtime timestamp %" PRIu64 ", refusing entry.",
ts->realtime);
if (!VALID_MONOTONIC(ts->monotonic))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid monotomic timestamp %" PRIu64 ", refusing entry.",
ts->monotonic);
} else {
dual_timestamp_get(&_ts);
ts = &_ts;
}
#if HAVE_GCRYPT
r = journal_file_maybe_append_tag(f, ts->realtime);
if (r < 0)
return r;
#endif
/* alloca() can't take 0, hence let's allocate at least one */
items = newa(EntryItem, MAX(1u, n_iovec));
for (i = 0; i < n_iovec; i++) {
uint64_t p;
Object *o;
r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
if (r < 0)
return r;
xor_hash ^= le64toh(o->data.hash);
items[i].object_offset = htole64(p);
items[i].hash = o->data.hash;
}
/* Order by the position on disk, in order to improve seek
* times for rotating media. */
typesafe_qsort(items, n_iovec, entry_item_cmp);
r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, offset);
/* If the memory mapping triggered a SIGBUS then we return an
* IO error and ignore the error code passed down to us, since
* it is very likely just an effect of a nullified replacement
* mapping page */
if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
r = -EIO;
if (f->post_change_timer)
schedule_post_change(f);
else
journal_file_post_change(f);
return r;
}
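/* To speed up repeated traversals of an entry array chain, we cache where the last lookup
 * in each chain ended up: which array of the chain it was in, how many items preceded that
 * array, and which index was last inspected. The cache is keyed by the offset of the
 * chain's first array and holds at most CHAIN_CACHE_MAX chains. */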
typedef struct ChainCacheItem {
uint64_t first; /* the array at the beginning of the chain */
uint64_t array; /* the cached array */
uint64_t begin; /* the first item in the cached array */
uint64_t total; /* the total number of items in all arrays before this one in the chain */
uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
static void chain_cache_put(
OrderedHashmap *h,
ChainCacheItem *ci,
uint64_t first,
uint64_t array,
uint64_t begin,
uint64_t total,
uint64_t last_index) {
if (!ci) {
/* If the array to cache is the first one of the
 * chain, it's not worth caching anything */
if (array == first)
return;
if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
ci = ordered_hashmap_steal_first(h);
assert(ci);
} else {
ci = new(ChainCacheItem, 1);
if (!ci)
return;
}
ci->first = first;
if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
free(ci);
return;
}
} else
assert(ci->first == first);
ci->array = array;
ci->begin = begin;
ci->total = total;
ci->last_index = last_index;
}
static int generic_array_get(
JournalFile *f,
uint64_t first,
uint64_t i,
Object **ret, uint64_t *offset) {
Object *o;
uint64_t p = 0, a, t = 0;
int r;
ChainCacheItem *ci;
assert(f);
a = first;
/* Try the chain cache first */
ci = ordered_hashmap_get(f->chain_cache, &first);
if (ci && i > ci->total) {
a = ci->array;
i -= ci->total;
t = ci->total;
}
while (a > 0) {
uint64_t k;
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
if (r < 0)
return r;
k = journal_file_entry_array_n_items(o);
if (i < k) {
p = le64toh(o->entry_array.items[i]);
goto found;
}
i -= k;
t += k;
a = le64toh(o->entry_array.next_entry_array_offset);
}
return 0;
found:
/* Let's cache this item for the next invocation */
chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = p;
return 1;
}
static int generic_array_get_plus_one(
JournalFile *f,
uint64_t extra,
uint64_t first,
uint64_t i,
Object **ret, uint64_t *offset) {
Object *o;
assert(f);
if (i == 0) {
int r;
r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = extra;
return 1;
}
return generic_array_get(f, first, i-1, ret, offset);
}
enum {
TEST_FOUND,
TEST_LEFT,
TEST_RIGHT
};
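/* Protocol for the test_object() callbacks used by the bisection helpers below: return
 * TEST_FOUND if the object at offset p matches the needle, TEST_LEFT if it sorts before
 * the needle (i.e. the search must continue to the right), and TEST_RIGHT if it sorts
 * after it (i.e. the search must continue to the left). */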
static int generic_array_bisect(
JournalFile *f,
uint64_t first,
uint64_t n,
uint64_t needle,
int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
direction_t direction,
Object **ret,
uint64_t *offset,
uint64_t *idx) {
uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
bool subtract_one = false;
Object *o, *array = NULL;
int r;
ChainCacheItem *ci;
assert(f);
assert(test_object);
/* Start with the first array in the chain */
a = first;
ci = ordered_hashmap_get(f->chain_cache, &first);
/* A cached chain item with begin == 0 may stem from the last entry array of a chain that
 * was allocated but not yet written to: such an array is still all zeroes, so it has no
 * valid first entry to bisect against. In that case we simply ignore the chain cache
 * (note the ci->begin != 0 check below). */
if (ci && n > ci->total && ci->begin != 0) {
/* Ah, we have iterated through this bisection array chain
 * previously! Let's see if we can skip ahead in the
 * chain, as far as we got last time. But we can't jump
 * backwards in the chain, so let's check that
 * first. */
r = test_object(f, ci->begin, needle);
if (r < 0)
return r;
if (r == TEST_LEFT) {
/* OK, what we are looking for is to the right of the
 * beginning of this EntryArray, so let's jump
 * straight to the previously cached array in the
 * chain */
a = ci->array;
n -= ci->total;
t = ci->total;
last_index = ci->last_index;
}
}
while (a > 0) {
uint64_t left, right, k, lp;
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
if (r < 0)
return r;
k = journal_file_entry_array_n_items(array);
right = MIN(k, n);
if (right <= 0)
return 0;
i = right - 1;
lp = p = le64toh(array->entry_array.items[i]);
if (p <= 0)
r = -EBADMSG;
else
r = test_object(f, p, needle);
if (r == -EBADMSG) {
log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
n = i;
continue;
}
if (r < 0)
return r;
if (r == TEST_FOUND)
r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
if (r == TEST_RIGHT) {
left = 0;
right -= 1;
if (last_index != (uint64_t) -1) {
assert(last_index <= right);
/* If we cached the last index we
 * looked at, let's try not to jump
 * around too wildly, and see if we can
 * limit the range to look at early on to
 * the immediate neighbors of the last
 * index we looked at. */
if (last_index > 0) {
uint64_t x = last_index - 1;
p = le64toh(array->entry_array.items[x]);
if (p <= 0)
return -EBADMSG;
r = test_object(f, p, needle);
if (r < 0)
return r;
if (r == TEST_FOUND)
r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
if (r == TEST_RIGHT)
right = x;
else
left = x + 1;
}
if (last_index < right) {
uint64_t y = last_index + 1;
p = le64toh(array->entry_array.items[y]);
if (p <= 0)
return -EBADMSG;
r = test_object(f, p, needle);
if (r < 0)
return r;
if (r == TEST_FOUND)
r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
if (r == TEST_RIGHT)
right = y;
else
left = y + 1;
}
}
for (;;) {
if (left == right) {
if (direction == DIRECTION_UP)
subtract_one = true;
i = left;
goto found;
}
assert(left < right);
i = (left + right) / 2;
p = le64toh(array->entry_array.items[i]);
if (p <= 0)
r = -EBADMSG;
else
r = test_object(f, p, needle);
if (r == -EBADMSG) {
log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
right = n = i;
continue;
}
if (r < 0)
return r;
if (r == TEST_FOUND)
r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
if (r == TEST_RIGHT)
right = i;
else
left = i + 1;
}
}
if (k >= n) {
if (direction == DIRECTION_UP) {
i = n;
subtract_one = true;
goto found;
}
return 0;
}
last_p = lp;
n -= k;
t += k;
last_index = (uint64_t) -1;
a = le64toh(array->entry_array.next_entry_array_offset);
}
return 0;
found:
if (subtract_one && t == 0 && i == 0)
return 0;
/* Let's cache this item for the next invocation */
chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
if (subtract_one && i == 0)
p = last_p;
else if (subtract_one)
p = le64toh(array->entry_array.items[i-1]);
else
p = le64toh(array->entry_array.items[i]);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = p;
if (idx)
*idx = t + i + (subtract_one ? -1 : 0);
return 1;
}
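/* Same as generic_array_bisect(), but operates on a data object's entry list, where the first entry is
 * stored inline as 'extra' and only the remaining n-1 entries live in the array chain. */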
static int generic_array_bisect_plus_one(
JournalFile *f,
uint64_t extra,
uint64_t first,
uint64_t n,
uint64_t needle,
int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
direction_t direction,
Object **ret,
uint64_t *offset,
uint64_t *idx) {
int r;
bool step_back = false;
Object *o;
assert(f);
assert(test_object);
if (n <= 0)
return 0;
/* This bisects the array in object 'first', but first checks
 * the extra entry */
r = test_object(f, extra, needle);
if (r < 0)
return r;
if (r == TEST_FOUND)
r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
/* If we are looking in DIRECTION_UP, we first need to
   check whether the actual array contains a matching entry,
   and if so return the last such entry. But if the array
   contains no match, we need to return this extra one.
   Hence remember this, and return it below. */
if (r == TEST_LEFT)
step_back = direction == DIRECTION_UP;
if (r == TEST_RIGHT) {
if (direction == DIRECTION_DOWN)
goto found;
else
return 0;
}
r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
if (r == 0 && step_back)
goto found;
if (r > 0 && idx)
(*idx)++;
return r;
found:
r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
if (r < 0)
return r;
if (ret)
*ret = o;
if (offset)
*offset = extra;
if (idx)
*idx = 0;
return 1;
}
_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
assert(f);
assert(p > 0);
if (p == needle)
return TEST_FOUND;
else if (p < needle)
return TEST_LEFT;
else
return TEST_RIGHT;
}
static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
Object *o;
int r;
assert(f);
assert(p > 0);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (le64toh(o->entry.seqnum) == needle)
return TEST_FOUND;
else if (le64toh(o->entry.seqnum) < needle)
return TEST_LEFT;
else
return TEST_RIGHT;
}
int journal_file_move_to_entry_by_seqnum(
JournalFile *f,
uint64_t seqnum,
direction_t direction,
Object **ret,
uint64_t *offset) {
assert(f);
assert(f->header);
return generic_array_bisect(f,
le64toh(f->header->entry_array_offset),
le64toh(f->header->n_entries),
seqnum,
test_object_seqnum,
direction,
ret, offset, NULL);
}
static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
Object *o;
int r;
assert(f);
assert(p > 0);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (le64toh(o->entry.realtime) == needle)
return TEST_FOUND;
else if (le64toh(o->entry.realtime) < needle)
return TEST_LEFT;
else
return TEST_RIGHT;
}
int journal_file_move_to_entry_by_realtime(
JournalFile *f,
uint64_t realtime,
direction_t direction,
Object **ret,
uint64_t *offset) {
assert(f);
assert(f->header);
return generic_array_bisect(f,
le64toh(f->header->entry_array_offset),
le64toh(f->header->n_entries),
realtime,
test_object_realtime,
direction,
ret, offset, NULL);
}
static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
Object *o;
int r;
assert(f);
assert(p > 0);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
if (le64toh(o->entry.monotonic) == needle)
return TEST_FOUND;
else if (le64toh(o->entry.monotonic) < needle)
return TEST_LEFT;
else
return TEST_RIGHT;
}
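/* Entries of a specific boot are indexed via the data object of the synthesized "_BOOT_ID=" field,
 * hence look up that data object first. */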
static int find_data_object_by_boot_id(
JournalFile *f,
sd_id128_t boot_id,
Object **o,
uint64_t *b) {
char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
sd_id128_to_string(boot_id, t + 9);
return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
}
int journal_file_move_to_entry_by_monotonic(
JournalFile *f,
sd_id128_t boot_id,
uint64_t monotonic,
direction_t direction,
Object **ret,
uint64_t *offset) {
Object *o;
int r;
assert(f);
r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
if (r < 0)
return r;
if (r == 0)
return -ENOENT;
return generic_array_bisect_plus_one(f,
le64toh(o->data.entry_offset),
le64toh(o->data.entry_array_offset),
le64toh(o->data.n_entries),
monotonic,
test_object_monotonic,
direction,
ret, offset, NULL);
}
void journal_file_reset_location(JournalFile *f) {
f->location_type = LOCATION_HEAD;
f->current_offset = 0;
f->current_seqnum = 0;
f->current_realtime = 0;
f->current_monotonic = 0;
zero(f->current_boot_id);
f->current_xor_hash = 0;
}
void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
f->location_type = LOCATION_SEEK;
f->current_offset = offset;
f->current_seqnum = le64toh(o->entry.seqnum);
f->current_realtime = le64toh(o->entry.realtime);
f->current_monotonic = le64toh(o->entry.monotonic);
f->current_boot_id = o->entry.boot_id;
f->current_xor_hash = le64toh(o->entry.xor_hash);
}
int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
int r;
assert(af);
assert(af->header);
assert(bf);
assert(bf->header);
assert(af->location_type == LOCATION_SEEK);
assert(bf->location_type == LOCATION_SEEK);
/* If contents and timestamps match, these entries are
* identical, even if the seqnum does not match */
if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
af->current_monotonic == bf->current_monotonic &&
af->current_realtime == bf->current_realtime &&
af->current_xor_hash == bf->current_xor_hash)
return 0;
if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
/* If this is from the same seqnum source, compare
* seqnums */
r = CMP(af->current_seqnum, bf->current_seqnum);
if (r != 0)
return r;
/* Wow! This is weird, different data but the same
* seqnums? Something is borked, but let's make the
* best of it and compare by time. */
}
if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
/* If the boot id matches, compare monotonic time */
r = CMP(af->current_monotonic, bf->current_monotonic);
if (r != 0)
return r;
}
/* Otherwise, compare UTC time */
r = CMP(af->current_realtime, bf->current_realtime);
if (r != 0)
return r;
/* Finally, compare by contents */
return CMP(af->current_xor_hash, bf->current_xor_hash);
}
static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
/* Increase or decrease the specified index, in the right direction. */
if (direction == DIRECTION_DOWN) {
if (*i >= n - 1)
return 0;
(*i) ++;
} else {
if (*i <= 0)
return 0;
(*i) --;
}
return 1;
}
static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
/* Consider it an error if either of the two offsets is uninitialized */
if (old_offset == 0 || new_offset == 0)
return false;
/* If we go down, the new offset must be larger than the old one. */
return direction == DIRECTION_DOWN ?
new_offset > old_offset :
new_offset < old_offset;
}
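/* Moves to the entry following (or preceding, depending on 'direction') the entry at offset p. If p
 * is 0 we start at the very head/tail; otherwise we bisect for p's index in the global entry array
 * and bump the index by one in the requested direction, skipping over corrupted entries. */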
int journal_file_next_entry(
JournalFile *f,
uint64_t p,
direction_t direction,
Object **ret, uint64_t *offset) {
uint64_t i, n, ofs;
int r;
assert(f);
assert(f->header);
n = le64toh(f->header->n_entries);
if (n <= 0)
return 0;
if (p == 0)
i = direction == DIRECTION_DOWN ? 0 : n - 1;
else {
r = generic_array_bisect(f,
le64toh(f->header->entry_array_offset),
le64toh(f->header->n_entries),
p,
test_object_offset,
DIRECTION_DOWN,
NULL, NULL,
&i);
if (r <= 0)
return r;
r = bump_array_index(&i, direction, n);
if (r <= 0)
return r;
}
/* And jump to it */
for (;;) {
r = generic_array_get(f,
le64toh(f->header->entry_array_offset),
i,
ret, &ofs);
if (r > 0)
break;
if (r != -EBADMSG)
return r;
/* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
* the next one might work for us instead. */
log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
r = bump_array_index(&i, direction, n);
if (r <= 0)
return r;
}
/* Ensure our array is properly ordered. */
if (p > 0 && !check_properly_ordered(ofs, p, direction))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"%s: entry array not properly ordered at entry %" PRIu64,
f->path, i);
if (offset)
*offset = ofs;
return 1;
}
int journal_file_next_entry_for_data(
JournalFile *f,
Object *o, uint64_t p,
uint64_t data_offset,
direction_t direction,
Object **ret, uint64_t *offset) {
uint64_t i, n, ofs;
Object *d;
int r;
assert(f);
assert(p > 0 || !o);
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
if (r < 0)
return r;
n = le64toh(d->data.n_entries);
if (n <= 0)
return n;
if (!o)
i = direction == DIRECTION_DOWN ? 0 : n - 1;
else {
if (o->object.type != OBJECT_ENTRY)
return -EINVAL;
r = generic_array_bisect_plus_one(f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
p,
test_object_offset,
DIRECTION_DOWN,
NULL, NULL,
&i);
if (r <= 0)
return r;
r = bump_array_index(&i, direction, n);
if (r <= 0)
return r;
}
for (;;) {
r = generic_array_get_plus_one(f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
i,
ret, &ofs);
if (r > 0)
break;
if (r != -EBADMSG)
return r;
log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
r = bump_array_index(&i, direction, n);
if (r <= 0)
return r;
}
/* Ensure our array is properly ordered. */
if (p > 0 && !check_properly_ordered(ofs, p, direction))
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"%s data entry array not properly ordered at entry %" PRIu64,
f->path, i);
if (offset)
*offset = ofs;
return 1;
}
int journal_file_move_to_entry_by_offset_for_data(
JournalFile *f,
uint64_t data_offset,
uint64_t p,
direction_t direction,
Object **ret, uint64_t *offset) {
int r;
Object *d;
assert(f);
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
if (r < 0)
return r;
return generic_array_bisect_plus_one(f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
p,
test_object_offset,
direction,
ret, offset, NULL);
}
int journal_file_move_to_entry_by_monotonic_for_data(
JournalFile *f,
uint64_t data_offset,
sd_id128_t boot_id,
uint64_t monotonic,
direction_t direction,
Object **ret, uint64_t *offset) {
Object *o, *d;
int r;
uint64_t b, z;
assert(f);
/* First, seek by time */
r = find_data_object_by_boot_id(f, boot_id, &o, &b);
if (r < 0)
return r;
if (r == 0)
return -ENOENT;
r = generic_array_bisect_plus_one(f,
le64toh(o->data.entry_offset),
le64toh(o->data.entry_array_offset),
le64toh(o->data.n_entries),
monotonic,
test_object_monotonic,
direction,
NULL, &z, NULL);
if (r <= 0)
return r;
/* And now, continue seeking until we find an entry that
* exists in both bisection arrays */
for (;;) {
Object *qo;
uint64_t p, q;
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
if (r < 0)
return r;
r = generic_array_bisect_plus_one(f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
z,
test_object_offset,
direction,
NULL, &p, NULL);
if (r <= 0)
return r;
r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
if (r < 0)
return r;
r = generic_array_bisect_plus_one(f,
le64toh(o->data.entry_offset),
le64toh(o->data.entry_array_offset),
le64toh(o->data.n_entries),
p,
test_object_offset,
direction,
&qo, &q, NULL);
if (r <= 0)
return r;
if (p == q) {
if (ret)
*ret = qo;
if (offset)
*offset = q;
return 1;
}
z = q;
}
}
int journal_file_move_to_entry_by_seqnum_for_data(
JournalFile *f,
uint64_t data_offset,
uint64_t seqnum,
direction_t direction,
Object **ret, uint64_t *offset) {
Object *d;
int r;
assert(f);
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
if (r < 0)
return r;
return generic_array_bisect_plus_one(f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
seqnum,
test_object_seqnum,
direction,
ret, offset, NULL);
}
int journal_file_move_to_entry_by_realtime_for_data(
JournalFile *f,
uint64_t data_offset,
uint64_t realtime,
direction_t direction,
Object **ret, uint64_t *offset) {
Object *d;
int r;
assert(f);
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
if (r < 0)
return r;
return generic_array_bisect_plus_one(f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
realtime,
test_object_realtime,
direction,
ret, offset, NULL);
}
void journal_file_dump(JournalFile *f) {
Object *o;
int r;
uint64_t p;
assert(f);
assert(f->header);
journal_file_print_header(f);
p = le64toh(f->header->header_size);
while (p != 0) {
r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
if (r < 0)
goto fail;
switch (o->object.type) {
case OBJECT_UNUSED:
printf("Type: OBJECT_UNUSED\n");
break;
case OBJECT_DATA:
printf("Type: OBJECT_DATA\n");
break;
case OBJECT_FIELD:
printf("Type: OBJECT_FIELD\n");
break;
case OBJECT_ENTRY:
printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
le64toh(o->entry.seqnum),
le64toh(o->entry.monotonic),
le64toh(o->entry.realtime));
break;
case OBJECT_FIELD_HASH_TABLE:
printf("Type: OBJECT_FIELD_HASH_TABLE\n");
break;
case OBJECT_DATA_HASH_TABLE:
printf("Type: OBJECT_DATA_HASH_TABLE\n");
break;
case OBJECT_ENTRY_ARRAY:
printf("Type: OBJECT_ENTRY_ARRAY\n");
break;
case OBJECT_TAG:
printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
le64toh(o->tag.seqnum),
le64toh(o->tag.epoch));
break;
default:
printf("Type: unknown (%i)\n", o->object.type);
break;
}
if (o->object.flags & OBJECT_COMPRESSION_MASK)
printf("Flags: %s\n",
object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
if (p == le64toh(f->header->tail_object_offset))
p = 0;
else
p = p + ALIGN64(le64toh(o->object.size));
}
return;
fail:
log_error("File corrupt");
}
static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
const char *x;
x = format_timestamp(buf, l, t);
if (x)
return x;
return " --- ";
}
void journal_file_print_header(JournalFile *f) {
char a[33], b[33], c[33], d[33];
char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
struct stat st;
char bytes[FORMAT_BYTES_MAX];
assert(f);
assert(f->header);
printf("File Path: %s\n"
"File ID: %s\n"
"Machine ID: %s\n"
"Boot ID: %s\n"
"Sequential Number ID: %s\n"
"State: %s\n"
"Compatible Flags:%s%s\n"
"Incompatible Flags:%s%s%s\n"
"Header size: %"PRIu64"\n"
"Arena size: %"PRIu64"\n"
"Data Hash Table Size: %"PRIu64"\n"
"Field Hash Table Size: %"PRIu64"\n"
"Rotate Suggested: %s\n"
"Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
"Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
"Head Realtime Timestamp: %s (%"PRIx64")\n"
"Tail Realtime Timestamp: %s (%"PRIx64")\n"
"Tail Monotonic Timestamp: %s (%"PRIx64")\n"
"Objects: %"PRIu64"\n"
"Entry Objects: %"PRIu64"\n",
f->path,
sd_id128_to_string(f->header->file_id, a),
sd_id128_to_string(f->header->machine_id, b),
sd_id128_to_string(f->header->boot_id, c),
sd_id128_to_string(f->header->seqnum_id, d),
f->header->state == STATE_OFFLINE ? "OFFLINE" :
f->header->state == STATE_ONLINE ? "ONLINE" :
f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
(le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
(le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
le64toh(f->header->header_size),
le64toh(f->header->arena_size),
le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
yes_no(journal_file_rotate_suggested(f, 0)),
le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
le64toh(f->header->n_objects),
le64toh(f->header->n_entries));
if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
printf("Data Objects: %"PRIu64"\n"
"Data Hash Table Fill: %.1f%%\n",
le64toh(f->header->n_data),
100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
printf("Field Objects: %"PRIu64"\n"
"Field Hash Table Fill: %.1f%%\n",
le64toh(f->header->n_fields),
100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
printf("Tag Objects: %"PRIu64"\n",
le64toh(f->header->n_tags));
if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
printf("Entry Array Objects: %"PRIu64"\n",
le64toh(f->header->n_entry_arrays));
if (fstat(f->fd, &st) >= 0)
printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
}
static int journal_file_warn_btrfs(JournalFile *f) {
unsigned attrs;
int r;
assert(f);
/* Before we write anything, check if the COW logic is turned
* off on btrfs. Given our write pattern that is quite
* unfriendly to COW file systems this should greatly improve
* performance on COW file systems, such as btrfs, at the
* expense of data integrity features (which shouldn't be too
* bad, given that we do our own checksumming). */
r = btrfs_is_filesystem(f->fd);
if (r < 0)
return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
if (!r)
return 0;
r = read_attr_fd(f->fd, &attrs);
if (r < 0)
return log_warning_errno(r, "Failed to read file attributes: %m");
if (attrs & FS_NOCOW_FL) {
log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
return 0;
}
log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
"This is likely to slow down journal access substantially, please consider turning "
"off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
return 1;
}
int journal_file_open(
int fd,
const char *fname,
int flags,
mode_t mode,
bool compress,
uint64_t compress_threshold_bytes,
bool seal,
JournalMetrics *metrics,
MMapCache *mmap_cache,
Set *deferred_closes,
JournalFile *template,
JournalFile **ret) {
bool newly_created = false;
JournalFile *f;
void *h;
int r;
char bytes[FORMAT_BYTES_MAX];
assert(ret);
assert(fd >= 0 || fname);
if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
return -EINVAL;
if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
return -EINVAL;
f = new(JournalFile, 1);
if (!f)
return -ENOMEM;
*f = (JournalFile) {
.fd = fd,
.mode = mode,
.flags = flags,
.prot = prot_from_flags(flags),
.writable = (flags & O_ACCMODE) != O_RDONLY,
#if HAVE_LZ4
.compress_lz4 = compress,
#elif HAVE_XZ
.compress_xz = compress,
#endif
.compress_threshold_bytes = compress_threshold_bytes == (uint64_t) -1 ?
DEFAULT_COMPRESS_THRESHOLD :
MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
#if HAVE_GCRYPT
.seal = seal,
#endif
};
log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
format_bytes(bytes, sizeof(bytes), f->compress_threshold_bytes));
if (mmap_cache)
f->mmap = mmap_cache_ref(mmap_cache);
else {
f->mmap = mmap_cache_new();
if (!f->mmap) {
r = -ENOMEM;
goto fail;
}
}
if (fname) {
f->path = strdup(fname);
if (!f->path) {
r = -ENOMEM;
goto fail;
}
} else {
assert(fd >= 0);
/* If we don't know the path, fill in something explanatory and vaguely useful */
if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
r = -ENOMEM;
goto fail;
}
}
f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
if (!f->chain_cache) {
r = -ENOMEM;
goto fail;
}
if (f->fd < 0) {
/* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
 * or so, we are likely to fail quickly rather than block for long. For regular files O_NONBLOCK has no
 * effect, hence it doesn't hurt in that case. */
f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
if (f->fd < 0) {
r = -errno;
goto fail;
}
/* fds we opened here by us should also be closed by us. */
f->close_fd = true;
r = fd_nonblock(f->fd, false);
if (r < 0)
goto fail;
}
f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
if (!f->cache_fd) {
r = -ENOMEM;
goto fail;
}
r = journal_file_fstat(f);
if (r < 0)
goto fail;
if (f->last_stat.st_size == 0 && f->writable) {
(void) journal_file_warn_btrfs(f);
/* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
* file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
* file systems maintain for each file, but the API to query this is very new, hence let's emulate this
* via extended attributes. If extended attributes are not supported we'll just skip this, and rely
* solely on mtime/atime/ctime of the file. */
(void) fd_setcrtime(f->fd, 0);
#if HAVE_GCRYPT
/* Try to load the FSPRG state, and if we can't, then
* just don't do sealing */
if (f->seal) {
r = journal_file_fss_load(f);
if (r < 0)
f->seal = false;
}
#endif
r = journal_file_init_header(f, template);
if (r < 0)
goto fail;
r = journal_file_fstat(f);
if (r < 0)
goto fail;
newly_created = true;
}
if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
r = -ENODATA;
goto fail;
}
r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
if (r < 0)
goto fail;
f->header = h;
if (!newly_created) {
set_clear_with_destructor(deferred_closes, journal_file_close);
r = journal_file_verify_header(f);
if (r < 0)
goto fail;
}
#if HAVE_GCRYPT
if (!newly_created && f->writable) {
r = journal_file_fss_load(f);
if (r < 0)
goto fail;
}
#endif
if (f->writable) {
if (metrics) {
journal_default_metrics(metrics, f->fd);
f->metrics = *metrics;
} else if (template)
f->metrics = template->metrics;
r = journal_file_refresh_header(f);
if (r < 0)
goto fail;
}
#if HAVE_GCRYPT
r = journal_file_hmac_setup(f);
if (r < 0)
goto fail;
#endif
if (newly_created) {
r = journal_file_setup_field_hash_table(f);
if (r < 0)
goto fail;
r = journal_file_setup_data_hash_table(f);
if (r < 0)
goto fail;
#if HAVE_GCRYPT
r = journal_file_append_first_tag(f);
if (r < 0)
goto fail;
#endif
}
if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
r = -EIO;
goto fail;
}
/* If the template has the deferred ftruncate() timer set up (which coalesces the ftruncate() calls
 * used to propagate mmap-based writes via inotify into short windows, rather than issuing one per
 * append), enable it for the new file with the same period. */
if (template && template->post_change_timer) {
r = journal_file_enable_post_change_timer(
f,
sd_event_source_get_event(template->post_change_timer),
template->post_change_timer_period);
if (r < 0)
goto fail;
}
/* The file is now successfully opened, so we take possession of any passed-in fd. */
f->close_fd = true;
*ret = f;
return 0;
fail:
if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
r = -EIO;
(void) journal_file_close(f);
return r;
}
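/* Renames a journal file to its archived name, i.e.
 * "<original name>@<seqnum id>-<head seqnum>-<head realtime>.journal", and marks it so that
 * offlining commits it with STATE_ARCHIVED. */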
int journal_file_archive(JournalFile *f) {
_cleanup_free_ char *p = NULL;
assert(f);
if (!f->writable)
return -EINVAL;
/* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we
 * refuse rotation, since we don't know the actual path, and hence couldn't rename the file. */
if (path_startswith(f->path, "/proc/self/fd"))
return -EINVAL;
if (!endswith(f->path, ".journal"))
return -EINVAL;
if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
(int) strlen(f->path) - 8, f->path,
SD_ID128_FORMAT_VAL(f->header->seqnum_id),
le64toh(f->header->head_entry_seqnum),
le64toh(f->header->head_entry_realtime)) < 0)
return -ENOMEM;
/* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
* ignore that case. */
if (rename(f->path, p) < 0 && errno != ENOENT)
2011-10-13 05:19:35 +02:00
return -errno;
/* Sync the rename to disk */
(void) fsync_directory_of_file(f->fd);
/* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
* to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
* which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
* the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
* occurs. */
f->archive = true;
/* Currently, btrfs is not very good with our write patterns and fragments heavily. Let's defrag our journal
* files when we archive them */
f->defrag_on_close = true;
return 0;
}
JournalFile* journal_initiate_close(
JournalFile *f,
Set *deferred_closes) {
int r;
assert(f);
if (deferred_closes) {
r = set_put(deferred_closes, f);
if (r < 0)
log_debug_errno(r, "Failed to add file to deferred close set, closing immediately.");
else {
(void) journal_file_set_offline(f, false);
return NULL;
}
}
return journal_file_close(f);
}
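/* Archives the current file and opens a fresh one under the original name, inheriting the mmap cache
 * and metrics from the old file (passed as template). On success, *f points to the new file; the old
 * one is closed, or queued into deferred_closes for asynchronous offlining. */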
int journal_file_rotate(
JournalFile **f,
bool compress,
uint64_t compress_threshold_bytes,
bool seal,
Set *deferred_closes) {
JournalFile *new_file = NULL;
int r;
assert(f);
assert(*f);
r = journal_file_archive(*f);
if (r < 0)
return r;
r = journal_file_open(
-1,
(*f)->path,
(*f)->flags,
(*f)->mode,
compress,
compress_threshold_bytes,
seal,
NULL, /* metrics */
(*f)->mmap,
deferred_closes,
*f, /* template */
&new_file);
journal_initiate_close(*f, deferred_closes);
*f = new_file;
return r;
}
int journal_file_dispose(int dir_fd, const char *fname) {
_cleanup_free_ char *p = NULL;
_cleanup_close_ int fd = -1;
assert(fname);
/* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shut down. Note that
* this is done without looking into the file or changing any of its contents. The idea is that this is called
* whenever something is suspicious and we want to move the file away and make clear that it is not accessed
* for writing anymore. */
if (!endswith(fname, ".journal"))
return -EINVAL;
if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
(int) strlen(fname) - 8, fname,
now(CLOCK_REALTIME),
random_u64()) < 0)
return -ENOMEM;
if (renameat(dir_fd, fname, dir_fd, p) < 0)
return -errno;
/* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */
fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
if (fd < 0)
log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m");
else {
(void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL);
(void) btrfs_defrag_fd(fd);
}
return 0;
}
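/* Like journal_file_open(), but if the file turns out to be corrupted or otherwise unusable and
 * O_CREAT is set, the broken file is renamed out of the way (see journal_file_dispose() above) and the
 * open is retried once with a fresh file. */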
int journal_file_open_reliably(
const char *fname,
int flags,
mode_t mode,
bool compress,
uint64_t compress_threshold_bytes,
bool seal,
JournalMetrics *metrics,
MMapCache *mmap_cache,
Set *deferred_closes,
JournalFile *template,
JournalFile **ret) {
int r;
r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
deferred_closes, template, ret);
if (!IN_SET(r,
-EBADMSG, /* Corrupted */
-ENODATA, /* Truncated */
-EHOSTDOWN, /* Other machine */
-EPROTONOSUPPORT, /* Incompatible feature */
-EBUSY, /* Unclean shutdown */
-ESHUTDOWN, /* Already archived */
-EIO, /* IO error, including SIGBUS on mmap */
-EIDRM, /* File has been deleted */
-ETXTBSY)) /* File is from the future */
return r;
if ((flags & O_ACCMODE) == O_RDONLY)
return r;
if (!(flags & O_CREAT))
return r;
if (!endswith(fname, ".journal"))
return r;
/* The file is corrupted. Rotate it away and try it again (but only once) */
log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
r = journal_file_dispose(AT_FDCWD, fname);
if (r < 0)
return r;
return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
deferred_closes, template, ret);
}
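/* Copies a single entry object from one journal file to another: each referenced data object is
 * resolved (and decompressed if necessary), appended to the target file, and a new entry is created
 * from the resulting items, recomputing the XOR hash along the way. */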
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
uint64_t i, n;
uint64_t q, xor_hash = 0;
int r;
EntryItem *items;
dual_timestamp ts;
const sd_id128_t *boot_id;
assert(from);
assert(to);
assert(o);
assert(p);
if (!to->writable)
return -EPERM;
ts.monotonic = le64toh(o->entry.monotonic);
ts.realtime = le64toh(o->entry.realtime);
boot_id = &o->entry.boot_id;
n = journal_file_entry_n_items(o);
/* alloca() can't take 0, hence let's allocate at least one */
items = newa(EntryItem, MAX(1u, n));
for (i = 0; i < n; i++) {
uint64_t l, h;
le64_t le_hash;
size_t t;
void *data;
Object *u;
q = le64toh(o->entry.items[i].object_offset);
le_hash = o->entry.items[i].hash;
r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
if (r < 0)
return r;
if (le_hash != o->data.hash)
return -EBADMSG;
l = le64toh(o->object.size) - offsetof(Object, data.payload);
t = (size_t) l;
/* We hit the limit on 32bit machines */
if ((uint64_t) t != l)
return -E2BIG;
if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_XZ || HAVE_LZ4
size_t rsize = 0;
r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
if (r < 0)
return r;
data = from->compress_buffer;
l = rsize;
#else
return -EPROTONOSUPPORT;
#endif
} else
data = o->data.payload;
r = journal_file_append_data(to, data, l, &u, &h);
if (r < 0)
return r;
xor_hash ^= le64toh(u->data.hash);
items[i].object_offset = htole64(h);
items[i].hash = u->data.hash;
r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
if (r < 0)
return r;
}
r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n,
NULL, NULL, NULL);
if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
return -EIO;
return r;
}
void journal_reset_metrics(JournalMetrics *m) {
assert(m);
/* Set everything to "pick automatic values". */
*m = (JournalMetrics) {
.min_use = (uint64_t) -1,
.max_use = (uint64_t) -1,
.min_size = (uint64_t) -1,
.max_size = (uint64_t) -1,
.keep_free = (uint64_t) -1,
.n_max_files = (uint64_t) -1,
};
}
void journal_default_metrics(JournalMetrics *m, int fd) {
char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
struct statvfs ss;
uint64_t fs_size;
assert(m);
assert(fd >= 0);
if (fstatvfs(fd, &ss) >= 0)
fs_size = ss.f_frsize * ss.f_blocks;
else {
log_debug_errno(errno, "Failed to determine disk size: %m");
fs_size = 0;
}
if (m->max_use == (uint64_t) -1) {
if (fs_size > 0) {
m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
if (m->max_use > DEFAULT_MAX_USE_UPPER)
m->max_use = DEFAULT_MAX_USE_UPPER;
if (m->max_use < DEFAULT_MAX_USE_LOWER)
m->max_use = DEFAULT_MAX_USE_LOWER;
} else
m->max_use = DEFAULT_MAX_USE_LOWER;
} else {
m->max_use = PAGE_ALIGN(m->max_use);
if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
m->max_use = JOURNAL_FILE_SIZE_MIN*2;
}
if (m->min_use == (uint64_t) -1)
m->min_use = DEFAULT_MIN_USE;
if (m->min_use > m->max_use)
m->min_use = m->max_use;
if (m->max_size == (uint64_t) -1) {
m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
m->max_size = DEFAULT_MAX_SIZE_UPPER;
} else
m->max_size = PAGE_ALIGN(m->max_size);
if (m->max_size != 0) {
if (m->max_size < JOURNAL_FILE_SIZE_MIN)
m->max_size = JOURNAL_FILE_SIZE_MIN;
if (m->max_use != 0 && m->max_size*2 > m->max_use)
m->max_use = m->max_size*2;
}
if (m->min_size == (uint64_t) -1)
m->min_size = JOURNAL_FILE_SIZE_MIN;
else {
m->min_size = PAGE_ALIGN(m->min_size);
if (m->min_size < JOURNAL_FILE_SIZE_MIN)
m->min_size = JOURNAL_FILE_SIZE_MIN;
if (m->max_size != 0 && m->min_size > m->max_size)
m->max_size = m->min_size;
}
if (m->keep_free == (uint64_t) -1) {
if (fs_size > 0) {
m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
m->keep_free = DEFAULT_KEEP_FREE_UPPER;
} else
m->keep_free = DEFAULT_KEEP_FREE;
}
if (m->n_max_files == (uint64_t) -1)
m->n_max_files = DEFAULT_N_MAX_FILES;
log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
format_bytes(a, sizeof(a), m->min_use),
format_bytes(b, sizeof(b), m->max_use),
format_bytes(c, sizeof(c), m->max_size),
format_bytes(d, sizeof(d), m->min_size),
format_bytes(e, sizeof(e), m->keep_free),
m->n_max_files);
}
int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
assert(f);
assert(f->header);
assert(from || to);
if (from) {
if (f->header->head_entry_realtime == 0)
return -ENOENT;
*from = le64toh(f->header->head_entry_realtime);
}
if (to) {
if (f->header->tail_entry_realtime == 0)
return -ENOENT;
*to = le64toh(f->header->tail_entry_realtime);
}
return 1;
}
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
Object *o;
uint64_t p;
int r;
assert(f);
assert(from || to);
r = find_data_object_by_boot_id(f, boot_id, &o, &p);
if (r <= 0)
return r;
if (le64toh(o->data.n_entries) <= 0)
return 0;
if (from) {
r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
if (r < 0)
return r;
*from = le64toh(o->entry.monotonic);
}
if (to) {
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
if (r < 0)
return r;
r = generic_array_get_plus_one(f,
le64toh(o->data.entry_offset),
le64toh(o->data.entry_array_offset),
le64toh(o->data.n_entries)-1,
&o, NULL);
if (r <= 0)
return r;
*to = le64toh(o->entry.monotonic);
}
return 1;
}
bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
assert(f);
assert(f->header);
/* If we gained new header fields we gained new features,
* hence suggest a rotation */
if (le64toh(f->header->header_size) < sizeof(Header)) {
log_debug("%s uses an outdated header, suggesting rotation.", f->path);
return true;
}
/* Let's check if the hash tables grew over a certain fill
* level (75%, borrowing this value from Java's hash table
* implementation), and if so suggest a rotation. To calculate
* the fill level we need the n_data field, which only exists
* in newer versions. */
if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
f->path,
100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
le64toh(f->header->n_data),
le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
(unsigned long long) f->last_stat.st_size,
f->last_stat.st_size / le64toh(f->header->n_data));
return true;
}
if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
f->path,
100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
le64toh(f->header->n_fields),
le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
return true;
}
/* Are the data objects properly indexed by field objects? */
if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
le64toh(f->header->n_data) > 0 &&
le64toh(f->header->n_fields) == 0)
return true;
if (max_file_usec > 0) {
usec_t t, h;
h = le64toh(f->header->head_entry_realtime);
t = now(CLOCK_REALTIME);
if (h > 0 && t > h + max_file_usec)
return true;
}
return false;
}