Systemd/src/journal/journal-file.h
Vito Caputo 7a24f3bf2f journal: coalesce ftruncate()s in 250ms windows
Prior to this change, every journal append caused an ftruncate() for the
sake of inotify propagation of the mmap-based writes.

With this change the notification is deferred up to ~250ms, coalescing
any repeated journal writes during the deferred period into a single
ftruncate().  The ftruncate() call isn't free and doing it on every
append adds unnecessary overhead and latency in the journald event loop.

Introduces journal_file_enable_post_change_timer() which manages a
timer on the provided sd-event instance for scheduling coalesced
ftruncates.  The ftruncate() behavior is unchanged unless
journal_file_enable_post_change_timer() is called on the JournalFile.

While not a tremendous improvement, profiling systemd-journald event loop
latencies with the instrumentation introduced by 34b8751 showed that
coalescing the ftruncates was low-hanging fruit worth pursuing.

Note orders 12 and 13 shifting left into order 11 and order 6 dipping into
order 5:

Unmodified:
     log2(us)   1 2 3  4 5  6   7   8  9   10 11   12   13 14 15 16 17 18 19
                -----------------------------------------------------------
[10685.414572]  0 0 0  0 38 602 61  2  290 60 1643 2554 13 1  4  1  0  0  1
[10690.415114]  0 0 0  0 0  646 54  7  309 44 2073 2148 17 1  3  0  0  0  1
[10695.415509]  0 0 0  0 1  650 73  3  324 37 2071 2270 9  0  0  1  0  1  0
[10700.416297]  0 0 0  0 0  659 50  4  318 38 2111 2152 6  0  1  0  0  1  1
[10705.417136]  0 0 0  0 2  660 48  4  320 38 2129 2146 12 1  1  0  0  1  1
[10710.489114]  0 0 0  0 0  673 38  3  321 37 1925 2339 7  0  0  0  0  1  1
[10715.489613]  0 0 0  0 3  656 64  8  317 48 2365 2007 7  0  0  0  0  0  1

Coalesced:
     log2(us)   1 2 3  4 5  6   7   8  9   10 11   12   13 14 15 16 17 18 19
                -----------------------------------------------------------
[ 6169.161360]  0 0 0  1 24 786 54  11 389 24 4192 771  6  4  0  0  1  0  1
[ 6174.161705]  0 0 0  1 18 800 35  6  380 27 3977 893  3  1  0  0  1  0  1
[ 6179.162741]  0 0 0  1 28 768 51  4  391 16 3998 831  5  3  0  0  0  0  2
[ 6184.162856]  0 0 0  0 19 770 60  2  376 26 3795 1004 9  5  1  0  1  0  1
[ 6189.163279]  0 0 0  0 28 761 49  7  372 27 3729 1056 3  2  0  0  1  0  1
[ 6194.164255]  0 0 0  0 25 785 49  7  394 19 3996 908  6  3  2  0  0  0  1
[ 6199.164658]  0 0 0  0 29 797 35  5  389 18 3995 898  3  4  1  1  1  0  1

The remaining high-order delays are a result of the synchronous fsyncs in
systemd-journald, beyond the scope of this commit.
2016-01-14 16:36:07 -08:00

248 lines
9.1 KiB
C

/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
#pragma once
/***
This file is part of systemd.
Copyright 2011 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <inttypes.h>
#ifdef HAVE_GCRYPT
#include <gcrypt.h>
#endif
#include "sd-id128.h"
#include "hashmap.h"
#include "journal-def.h"
#include "macro.h"
#include "mmap-cache.h"
#include "sd-event.h"
#include "sparse-endian.h"
/* Size and count limits for journal files, one limit per field. */
typedef struct JournalMetrics {
/* For all these: -1 means "pick automatically", and 0 means "no limit enforced".
 * The fields are unsigned, so -1 is stored as (uint64_t) -1. */
uint64_t max_size; /* how large journal files grow at max */
uint64_t min_size; /* how large journal files grow at least */
uint64_t max_use; /* how much disk space to use in total at max, keep_free permitting */
uint64_t min_use; /* how much disk space to use in total at least, even if keep_free says not to */
uint64_t keep_free; /* how much to keep free on disk */
uint64_t n_max_files; /* how many files to keep around at max */
} JournalMetrics;
/* Direction selector taken by the journal_file_next_entry*() and
 * journal_file_move_to_entry_*() calls declared in this header.
 * NOTE(review): presumably DOWN walks towards newer entries and UP towards
 * older ones — confirm in journal-file.c. */
typedef enum direction {
DIRECTION_UP,
DIRECTION_DOWN
} direction_t;
/* Kind of position a reader currently holds; stored per file in
 * JournalFile.location_type. */
typedef enum LocationType {
/* The first and last entries, resp. */
LOCATION_HEAD,
LOCATION_TAIL,
/* We already read the entry we currently point to, and the
 * next one to read should probably not be this one again. */
LOCATION_DISCRETE,
/* We should seek to the precise location specified, and
 * return it, as we haven't read it yet. */
LOCATION_SEEK
} LocationType;
/* Per-file state for one journal file.
 *
 * NOTE(review): this header does not document the individual fields; the
 * notes below are inferred from names and types and should be confirmed
 * against journal-file.c before being relied upon. */
typedef struct JournalFile {
/* Presumably the file descriptor plus the open/mmap parameters it was opened with — confirm. */
int fd;
mode_t mode;
int flags;
int prot;
/* Single-bit flags. compress_xz and compress_lz4 are the two bits tested
 * by JOURNAL_FILE_COMPRESS(). */
bool writable:1;
bool compress_xz:1;
bool compress_lz4:1;
bool seal:1;
bool defrag_on_close:1;
bool tail_entry_monotonic_valid:1;
/* Reader position bookkeeping; location_type says how the current_*
 * values further down are to be interpreted (see LocationType). */
direction_t last_direction;
LocationType location_type;
uint64_t last_n_entries;
char *path;
/* Presumably a cached fstat() result and the time it was taken — confirm. */
struct stat last_stat;
usec_t last_stat_usec;
/* Presumably pointers into the memory-mapped file — confirm. */
Header *header;
HashItem *data_hash_table;
HashItem *field_hash_table;
uint64_t current_offset;
uint64_t current_seqnum;
uint64_t current_realtime;
uint64_t current_monotonic;
sd_id128_t current_boot_id;
uint64_t current_xor_hash;
JournalMetrics metrics;
MMapCache *mmap;
/* Timer event source and its period, used to coalesce post-change
 * notifications. NOTE(review): presumably only set once
 * journal_file_enable_post_change_timer() has been called — confirm. */
sd_event_source *post_change_timer;
usec_t post_change_timer_period;
OrderedHashmap *chain_cache;
/* Scratch buffer, only present when built with XZ or LZ4 support. */
#if defined(HAVE_XZ) || defined(HAVE_LZ4)
void *compress_buffer;
size_t compress_buffer_size;
#endif
/* Only present when built with gcrypt.
 * NOTE(review): looks like the sealing state (HMAC handle, FSS header,
 * FSPRG state/seed) — confirm. */
#ifdef HAVE_GCRYPT
gcry_md_hd_t hmac;
bool hmac_running;
FSSHeader *fss_file;
size_t fss_file_size;
uint64_t fss_start_usec;
uint64_t fss_interval_usec;
void *fsprg_state;
size_t fsprg_state_size;
void *fsprg_seed;
size_t fsprg_seed_size;
#endif
} JournalFile;
/* Opens the journal file named by fname and hands back a JournalFile via ret.
 *
 * NOTE(review): parameter roles are inferred from names and types only —
 * flags/mode look like open(2) arguments, compress/seal like per-file feature
 * switches, metrics like the size limits to apply, mmap_cache like a shared
 * mapping cache, and template like an existing file to take settings from.
 * The return value is presumably 0 on success and a negative errno-style
 * code on failure. Confirm all of this in journal-file.c. */
int journal_file_open(
const char *fname,
int flags,
mode_t mode,
bool compress,
bool seal,
JournalMetrics *metrics,
MMapCache *mmap_cache,
JournalFile *template,
JournalFile **ret);
/* NOTE(review): presumably syncs f and marks it offline in its header;
 * return presumably 0 / negative errno — confirm in journal-file.c. */
int journal_file_set_offline(JournalFile *f);
/* Closes/releases j and returns a JournalFile pointer.
 * NOTE(review): presumably always returns NULL so callers can write
 * "f = journal_file_close(f);" — confirm. */
JournalFile* journal_file_close(JournalFile *j);
/* Variant of journal_file_open() taking the exact same parameter list.
 *
 * NOTE(review): presumably recovers from an unusable existing file (e.g. by
 * moving it aside and creating a fresh one) instead of failing — confirm in
 * journal-file.c. */
int journal_file_open_reliably(
const char *fname,
int flags,
mode_t mode,
bool compress,
bool seal,
JournalMetrics *metrics,
MMapCache *mmap_cache,
JournalFile *template,
JournalFile **ret);
/* Rounds x up to the next multiple of 8. The ULL constants force the
 * arithmetic to be done in at least unsigned long long; for a 64-bit x,
 * values above UINT64_MAX - 7 wrap around to 0. */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
/* True if x is a multiple of 8, i.e. its three lowest bits are clear. */
#define VALID64(x) (((x) & 7ULL) == 0ULL)
/* printf()-style conversion for a uint64_t offset: lower-case hex,
 * zero-padded to at least six digits.
 * Use six characters to cover the offsets common in smallish journal
 * files without adding too many zeros. */
#define OFSfmt "%06"PRIx64
static inline bool VALID_REALTIME(uint64_t u) {
        /* A wallclock timestamp of zero is never valid. */
        if (u == 0)
                return false;

        /* Anything that fits into the low 55 bits is accepted, which covers
         * timestamps up to the year 3112. That should be plenty of room... */
        return (u >> 55) == 0;
}
static inline bool VALID_MONOTONIC(uint64_t u) {
        /* Valid as long as bits 55 and up are all clear, i.e. for roughly
         * 1142 years of runtime. Unlike realtime, zero is accepted here. */
        return (u >> 55) == 0;
}
static inline bool VALID_EPOCH(uint64_t u) {
        /* First epoch value that is rejected. Staying below it allows
         * changing the key every usec for 1142 years. */
        const uint64_t limit = 1ULL << 55;

        if (u >= limit)
                return false;

        return true;
}
/* True if the header size recorded in h (header_size, stored little-endian)
 * is large enough to cover member `field` of Header completely, i.e. the
 * member's offset plus its size fits within the recorded size. */
#define JOURNAL_HEADER_CONTAINS(h, field) \
(le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))
/* The three macros below test one flag bit in the header's little-endian
 * 32-bit flag words and normalize the result to 0 or 1 with "!!". */
/* Tests HEADER_COMPATIBLE_SEALED in compatible_flags. */
#define JOURNAL_HEADER_SEALED(h) \
(!!(le32toh((h)->compatible_flags) & HEADER_COMPATIBLE_SEALED))
/* Tests HEADER_INCOMPATIBLE_COMPRESSED_XZ in incompatible_flags. */
#define JOURNAL_HEADER_COMPRESSED_XZ(h) \
(!!(le32toh((h)->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
/* Tests HEADER_INCOMPATIBLE_COMPRESSED_LZ4 in incompatible_flags. */
#define JOURNAL_HEADER_COMPRESSED_LZ4(h) \
(!!(le32toh((h)->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
/* Looks up the object of the given type at the given file offset and hands
 * it back via ret.
 * NOTE(review): presumably returns 0 / negative errno, and the returned
 * pointer presumably refers to mapped file memory — confirm. */
int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret);
/* Item counts of an entry, entry-array and hash-table object, respectively.
 * All three are declared _pure_. */
uint64_t journal_file_entry_n_items(Object *o) _pure_;
uint64_t journal_file_entry_array_n_items(Object *o) _pure_;
uint64_t journal_file_hash_table_n_items(Object *o) _pure_;
/* Appends a new object of the given type and size to f; ret and offset are
 * output parameters for the new object and its position.
 * NOTE(review): return presumably 0 / negative errno — confirm. */
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset);
/* Appends one entry built from the n_iovec elements of iovec[] to f.
 * NOTE(review): ts is presumably the entry timestamp (possibly optional),
 * seqno presumably an in/out sequence number counter, and ret/offset
 * presumably optional outputs — confirm in journal-file.c. */
int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqno, Object **ret, uint64_t *offset);
/* Lookups of data and field objects by content (a buffer plus its size).
 * The *_with_hash() variants additionally take the hash as a parameter.
 * NOTE(review): presumably the plain variants compute the hash themselves
 * and forward to the *_with_hash() ones, and the return value presumably
 * distinguishes found / not found / error — confirm in journal-file.c. */
int journal_file_find_data_object(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset);
int journal_file_find_data_object_with_hash(JournalFile *f, const void *data, uint64_t size, uint64_t hash, Object **ret, uint64_t *offset);
int journal_file_find_field_object(JournalFile *f, const void *field, uint64_t size, Object **ret, uint64_t *offset);
int journal_file_find_field_object_with_hash(JournalFile *f, const void *field, uint64_t size, uint64_t hash, Object **ret, uint64_t *offset);
/* Reader-position helpers.
 * NOTE(review): presumably these maintain location_type and the current_*
 * fields of JournalFile — reset clears the saved position, save records the
 * entry object o at the given offset — confirm. */
void journal_file_reset_location(JournalFile *f);
void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset);
/* Compares the saved positions of two files.
 * NOTE(review): presumably returns <0 / 0 / >0 in comparator style — confirm. */
int journal_file_compare_locations(JournalFile *af, JournalFile *bf);
/* Entry iteration. All take a direction and hand the entry found plus its
 * offset back via ret/offset.
 * NOTE(review): p is presumably the offset to continue from (0 for "start
 * at the end given by direction"), and the return value presumably
 * distinguishes found / not found / error — confirm in journal-file.c. */
int journal_file_next_entry(JournalFile *f, uint64_t p, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_next_entry_for_data(JournalFile *f, Object *o, uint64_t p, uint64_t data_offset, direction_t direction, Object **ret, uint64_t *offset);
/* Seeking by sequence number, realtime timestamp, or boot ID plus monotonic
 * timestamp, across all entries of the file. */
int journal_file_move_to_entry_by_seqnum(JournalFile *f, uint64_t seqnum, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_realtime(JournalFile *f, uint64_t realtime, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_monotonic(JournalFile *f, sd_id128_t boot_id, uint64_t monotonic, direction_t direction, Object **ret, uint64_t *offset);
/* Same seek keys (plus plain offset p), but with a data_offset parameter.
 * NOTE(review): presumably restricted to entries referencing the data object
 * at data_offset — confirm. */
int journal_file_move_to_entry_by_offset_for_data(JournalFile *f, uint64_t data_offset, uint64_t p, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_seqnum_for_data(JournalFile *f, uint64_t data_offset, uint64_t seqnum, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_realtime_for_data(JournalFile *f, uint64_t data_offset, uint64_t realtime, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_monotonic_for_data(JournalFile *f, uint64_t data_offset, sd_id128_t boot_id, uint64_t monotonic, direction_t direction, Object **ret, uint64_t *offset);
/* Copies entry object o from one journal file into another.
 * NOTE(review): p is presumably o's offset in `from`; seqnum/ret/offset
 * presumably behave as in journal_file_append_entry() — confirm. */
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset);
/* Debugging output helpers; both return nothing.
 * NOTE(review): presumably print to stdout — confirm. */
void journal_file_dump(JournalFile *f);
void journal_file_print_header(JournalFile *f);
/* Rotates the journal file. f is passed by reference.
 * NOTE(review): presumably *f is replaced by the newly opened file, with
 * compress/seal applied to it — confirm. */
int journal_file_rotate(JournalFile **f, bool compress, bool seal);
/* Post-change notification for f.
 * NOTE(review): presumably this is the ftruncate() issued so that inotify
 * watchers notice mmap-based writes — confirm in journal-file.c. */
void journal_file_post_change(JournalFile *f);
/* Enables deferred post-change handling for f, using a timer on the event
 * loop e so that repeated changes can be coalesced.
 * NOTE(review): t is presumably the coalescing period in usec (stored in
 * JournalFile.post_change_timer_period), and the return value presumably
 * 0 / negative errno — confirm. */
int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t);
/* JournalMetrics helpers.
 * NOTE(review): presumably reset sets every field to "pick automatically",
 * and default fills such fields with concrete values derived from the file
 * system behind fd — confirm. */
void journal_reset_metrics(JournalMetrics *m);
void journal_default_metrics(JournalMetrics *m, int fd);
/* Report a time range for f via from/to, in realtime or, for a given boot
 * ID, in monotonic time.
 * NOTE(review): from/to are presumably optional; return convention to be
 * confirmed. */
int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to);
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot, usec_t *from, usec_t *to);
/* Tells whether rotating f is advisable, given max_file_usec.
 * NOTE(review): max_file_usec is presumably the maximum time span one file
 * should cover — confirm. */
bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec);
/* NOTE(review): presumably map the data / field hash tables and set
 * JournalFile.data_hash_table / field_hash_table — confirm. */
int journal_file_map_data_hash_table(JournalFile *f);
int journal_file_map_field_hash_table(JournalFile *f);
/* Tells whether any compression is turned on for f, i.e. whether at least
 * one of its compress_xz / compress_lz4 flags is set. f must not be NULL. */
static inline bool JOURNAL_FILE_COMPRESS(JournalFile *f) {
        assert(f);

        if (f->compress_xz)
                return true;

        return f->compress_lz4;
}