Systemd/src/journal/journal-file.h

267 lines
9.6 KiB
C
Raw Normal View History

#pragma once
/***
This file is part of systemd.
Copyright 2011 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <inttypes.h>
#if HAVE_GCRYPT
#include <gcrypt.h>
#endif
#include "sd-id128.h"
#include "hashmap.h"
#include "journal-def.h"
#include "macro.h"
#include "mmap-cache.h"
journal: coalesce ftruncate()s in 250ms windows Prior to this change every journal append causes an ftruncate() for the sake of inotify propagation of the mmap-based writes. With this change the notification is deferred up to ~250ms, coalescing any repeated journal writes during the deferred period into a single ftruncate(). The ftruncate() call isn't free and doing it on every append adds unnecessary overhead and latency in the journald event loop. Introduces journal_file_enable_post_change_timer() which manages a timer on the provided sd-event instance for scheduling coalesced ftruncates. The ftruncate() behavior is unchanged unless journal_file_enable_post_change_timer() is called on the JournalFile. While not a tremendous improvement, profiling systemd-journald event loop latencies using instrumentation as introduced by 34b8751 it was observed that coalescing the ftruncates was low-hanging fruit worth pursuing. Note orders 12 and 13 shifting left into order 11 and order 6 dipping into order 5: Unmodified: log2(us) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ----------------------------------------------------------- [10685.414572] 0 0 0 0 38 602 61 2 290 60 1643 2554 13 1 4 1 0 0 1 [10690.415114] 0 0 0 0 0 646 54 7 309 44 2073 2148 17 1 3 0 0 0 1 [10695.415509] 0 0 0 0 1 650 73 3 324 37 2071 2270 9 0 0 1 0 1 0 [10700.416297] 0 0 0 0 0 659 50 4 318 38 2111 2152 6 0 1 0 0 1 1 [10705.417136] 0 0 0 0 2 660 48 4 320 38 2129 2146 12 1 1 0 0 1 1 [10710.489114] 0 0 0 0 0 673 38 3 321 37 1925 2339 7 0 0 0 0 1 1 [10715.489613] 0 0 0 0 3 656 64 8 317 48 2365 2007 7 0 0 0 0 0 1 Coalesced: log2(us) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ----------------------------------------------------------- [ 6169.161360] 0 0 0 1 24 786 54 11 389 24 4192 771 6 4 0 0 1 0 1 [ 6174.161705] 0 0 0 1 18 800 35 6 380 27 3977 893 3 1 0 0 1 0 1 [ 6179.162741] 0 0 0 1 28 768 51 4 391 16 3998 831 5 3 0 0 0 0 2 [ 6184.162856] 0 0 0 0 19 770 60 2 376 26 3795 1004 9 5 1 0 1 0 1 [ 6189.163279] 0 0 0 0 28 761 49 7 372 27 3729 1056 3 2 0 0 1 0 1 [ 6194.164255] 0 0 0 0 25 785 49 7 394 19 3996 908 6 3 2 0 0 0 1 [ 6199.164658] 0 0 0 0 29 797 35 5 389 18 3995 898 3 4 1 1 1 0 1 The remaining high-order delays are a result of the synchronous fsyncs in systemd-journald, beyond the scope of this commit.
2015-12-11 07:42:22 +01:00
#include "sd-event.h"
#include "sparse-endian.h"
typedef struct JournalMetrics {
/* For all these: -1 means "pick automatically", and 0 means "no limit enforced" */
uint64_t max_size; /* how large journal files grow at max */
uint64_t min_size; /* how large journal files grow at least */
uint64_t max_use; /* how much disk space to use in total at max, keep_free permitting */
uint64_t min_use; /* how much disk space to use in total at least, even if keep_free says not to */
uint64_t keep_free; /* how much to keep free on disk */
uint64_t n_max_files; /* how many files to keep around at max */
} JournalMetrics;
typedef enum direction {
DIRECTION_UP,
DIRECTION_DOWN
} direction_t;
typedef enum LocationType {
/* The first and last entries, resp. */
LOCATION_HEAD,
LOCATION_TAIL,
/* We already read the entry we currently point to, and the
* next one to read should probably not be this one again. */
LOCATION_DISCRETE,
/* We should seek to the precise location specified, and
* return it, as we haven't read it yet. */
LOCATION_SEEK
} LocationType;
typedef enum OfflineState {
OFFLINE_JOINED,
OFFLINE_SYNCING,
OFFLINE_OFFLINING,
OFFLINE_CANCEL,
OFFLINE_AGAIN_FROM_SYNCING,
OFFLINE_AGAIN_FROM_OFFLINING,
OFFLINE_DONE
} OfflineState;
typedef struct JournalFile {
int fd;
MMapFileDescriptor *cache_fd;
2011-10-13 05:19:35 +02:00
mode_t mode;
2011-10-13 05:19:35 +02:00
int flags;
int prot;
2013-11-26 18:40:23 +01:00
bool writable:1;
bool compress_xz:1;
bool compress_lz4:1;
2013-11-26 18:40:23 +01:00
bool seal:1;
bool defrag_on_close:1;
bool close_fd:1;
bool archive:1;
2013-11-26 18:40:23 +01:00
bool tail_entry_monotonic_valid:1;
direction_t last_direction;
LocationType location_type;
uint64_t last_n_entries;
char *path;
struct stat last_stat;
usec_t last_stat_usec;
Header *header;
HashItem *data_hash_table;
HashItem *field_hash_table;
uint64_t current_offset;
uint64_t current_seqnum;
uint64_t current_realtime;
uint64_t current_monotonic;
sd_id128_t current_boot_id;
uint64_t current_xor_hash;
JournalMetrics metrics;
MMapCache *mmap;
journal: coalesce ftruncate()s in 250ms windows Prior to this change every journal append causes an ftruncate() for the sake of inotify propagation of the mmap-based writes. With this change the notification is deferred up to ~250ms, coalescing any repeated journal writes during the deferred period into a single ftruncate(). The ftruncate() call isn't free and doing it on every append adds unnecessary overhead and latency in the journald event loop. Introduces journal_file_enable_post_change_timer() which manages a timer on the provided sd-event instance for scheduling coalesced ftruncates. The ftruncate() behavior is unchanged unless journal_file_enable_post_change_timer() is called on the JournalFile. While not a tremendous improvement, profiling systemd-journald event loop latencies using instrumentation as introduced by 34b8751 it was observed that coalescing the ftruncates was low-hanging fruit worth pursuing. Note orders 12 and 13 shifting left into order 11 and order 6 dipping into order 5: Unmodified: log2(us) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ----------------------------------------------------------- [10685.414572] 0 0 0 0 38 602 61 2 290 60 1643 2554 13 1 4 1 0 0 1 [10690.415114] 0 0 0 0 0 646 54 7 309 44 2073 2148 17 1 3 0 0 0 1 [10695.415509] 0 0 0 0 1 650 73 3 324 37 2071 2270 9 0 0 1 0 1 0 [10700.416297] 0 0 0 0 0 659 50 4 318 38 2111 2152 6 0 1 0 0 1 1 [10705.417136] 0 0 0 0 2 660 48 4 320 38 2129 2146 12 1 1 0 0 1 1 [10710.489114] 0 0 0 0 0 673 38 3 321 37 1925 2339 7 0 0 0 0 1 1 [10715.489613] 0 0 0 0 3 656 64 8 317 48 2365 2007 7 0 0 0 0 0 1 Coalesced: log2(us) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ----------------------------------------------------------- [ 6169.161360] 0 0 0 1 24 786 54 11 389 24 4192 771 6 4 0 0 1 0 1 [ 6174.161705] 0 0 0 1 18 800 35 6 380 27 3977 893 3 1 0 0 1 0 1 [ 6179.162741] 0 0 0 1 28 768 51 4 391 16 3998 831 5 3 0 0 0 0 2 [ 6184.162856] 0 0 0 0 19 770 60 2 376 26 3795 1004 9 5 1 0 1 0 1 [ 6189.163279] 0 0 0 0 28 761 49 7 372 27 3729 1056 3 2 0 0 1 0 1 [ 6194.164255] 0 0 0 0 25 785 49 7 394 19 3996 908 6 3 2 0 0 0 1 [ 6199.164658] 0 0 0 0 29 797 35 5 389 18 3995 898 3 4 1 1 1 0 1 The remaining high-order delays are a result of the synchronous fsyncs in systemd-journald, beyond the scope of this commit.
2015-12-11 07:42:22 +01:00
sd_event_source *post_change_timer;
usec_t post_change_timer_period;
OrderedHashmap *chain_cache;
pthread_t offline_thread;
volatile OfflineState offline_state;
#if HAVE_XZ || HAVE_LZ4
void *compress_buffer;
size_t compress_buffer_size;
#endif
#if HAVE_GCRYPT
gcry_md_hd_t hmac;
bool hmac_running;
FSSHeader *fss_file;
size_t fss_file_size;
uint64_t fss_start_usec;
uint64_t fss_interval_usec;
2012-08-16 20:51:43 +02:00
void *fsprg_state;
size_t fsprg_state_size;
void *fsprg_seed;
size_t fsprg_seed_size;
#endif
} JournalFile;
int journal_file_open(
int fd,
const char *fname,
int flags,
mode_t mode,
bool compress,
bool seal,
JournalMetrics *metrics,
MMapCache *mmap_cache,
Set *deferred_closes,
JournalFile *template,
JournalFile **ret);
int journal_file_set_offline(JournalFile *f, bool wait);
bool journal_file_is_offlining(JournalFile *f);
JournalFile* journal_file_close(JournalFile *j);
void journal_file_close_set(Set *s);
int journal_file_open_reliably(
const char *fname,
int flags,
mode_t mode,
bool compress,
bool seal,
JournalMetrics *metrics,
MMapCache *mmap_cache,
Set *deferred_closes,
JournalFile *template,
JournalFile **ret);
2012-08-16 01:51:54 +02:00
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
#define VALID64(x) (((x) & 7ULL) == 0ULL)
2012-08-16 01:51:54 +02:00
/* Use six characters to cover the offsets common in smallish journal
* files without adding too many zeros. */
#define OFSfmt "%06"PRIx64
2012-08-19 15:15:59 +02:00
static inline bool VALID_REALTIME(uint64_t u) {
/* This considers timestamps until the year 3112 valid. That should be plenty room... */
return u > 0 && u < (1ULL << 55);
}
static inline bool VALID_MONOTONIC(uint64_t u) {
/* This considers timestamps until 1142 years of runtime valid. */
return u < (1ULL << 55);
}
static inline bool VALID_EPOCH(uint64_t u) {
/* This allows changing the key for 1142 years, every usec. */
return u < (1ULL << 55);
}
2012-08-16 01:51:54 +02:00
#define JOURNAL_HEADER_CONTAINS(h, field) \
(le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))
#define JOURNAL_HEADER_SEALED(h) \
(!!(le32toh((h)->compatible_flags) & HEADER_COMPATIBLE_SEALED))
#define JOURNAL_HEADER_COMPRESSED_XZ(h) \
(!!(le32toh((h)->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
#define JOURNAL_HEADER_COMPRESSED_LZ4(h) \
(!!(le32toh((h)->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
2014-12-10 15:18:49 +01:00
int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret);
uint64_t journal_file_entry_n_items(Object *o) _pure_;
uint64_t journal_file_entry_array_n_items(Object *o) _pure_;
uint64_t journal_file_hash_table_n_items(Object *o) _pure_;
2014-12-10 15:18:49 +01:00
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset);
int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqno, Object **ret, uint64_t *offset);
int journal_file_find_data_object(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset);
int journal_file_find_data_object_with_hash(JournalFile *f, const void *data, uint64_t size, uint64_t hash, Object **ret, uint64_t *offset);
int journal_file_find_field_object(JournalFile *f, const void *field, uint64_t size, Object **ret, uint64_t *offset);
int journal_file_find_field_object_with_hash(JournalFile *f, const void *field, uint64_t size, uint64_t hash, Object **ret, uint64_t *offset);
void journal_file_reset_location(JournalFile *f);
void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset);
int journal_file_compare_locations(JournalFile *af, JournalFile *bf);
int journal_file_next_entry(JournalFile *f, uint64_t p, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_next_entry_for_data(JournalFile *f, Object *o, uint64_t p, uint64_t data_offset, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_seqnum(JournalFile *f, uint64_t seqnum, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_realtime(JournalFile *f, uint64_t realtime, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_monotonic(JournalFile *f, sd_id128_t boot_id, uint64_t monotonic, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_offset_for_data(JournalFile *f, uint64_t data_offset, uint64_t p, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_seqnum_for_data(JournalFile *f, uint64_t data_offset, uint64_t seqnum, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_realtime_for_data(JournalFile *f, uint64_t data_offset, uint64_t realtime, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_move_to_entry_by_monotonic_for_data(JournalFile *f, uint64_t data_offset, sd_id128_t boot_id, uint64_t monotonic, direction_t direction, Object **ret, uint64_t *offset);
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset);
void journal_file_dump(JournalFile *f);
void journal_file_print_header(JournalFile *f);
int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes);
2011-10-13 05:19:35 +02:00
void journal_file_post_change(JournalFile *f);
journal: coalesce ftruncate()s in 250ms windows Prior to this change every journal append causes an ftruncate() for the sake of inotify propagation of the mmap-based writes. With this change the notification is deferred up to ~250ms, coalescing any repeated journal writes during the deferred period into a single ftruncate(). The ftruncate() call isn't free and doing it on every append adds unnecessary overhead and latency in the journald event loop. Introduces journal_file_enable_post_change_timer() which manages a timer on the provided sd-event instance for scheduling coalesced ftruncates. The ftruncate() behavior is unchanged unless journal_file_enable_post_change_timer() is called on the JournalFile. While not a tremendous improvement, profiling systemd-journald event loop latencies using instrumentation as introduced by 34b8751 it was observed that coalescing the ftruncates was low-hanging fruit worth pursuing. Note orders 12 and 13 shifting left into order 11 and order 6 dipping into order 5: Unmodified: log2(us) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ----------------------------------------------------------- [10685.414572] 0 0 0 0 38 602 61 2 290 60 1643 2554 13 1 4 1 0 0 1 [10690.415114] 0 0 0 0 0 646 54 7 309 44 2073 2148 17 1 3 0 0 0 1 [10695.415509] 0 0 0 0 1 650 73 3 324 37 2071 2270 9 0 0 1 0 1 0 [10700.416297] 0 0 0 0 0 659 50 4 318 38 2111 2152 6 0 1 0 0 1 1 [10705.417136] 0 0 0 0 2 660 48 4 320 38 2129 2146 12 1 1 0 0 1 1 [10710.489114] 0 0 0 0 0 673 38 3 321 37 1925 2339 7 0 0 0 0 1 1 [10715.489613] 0 0 0 0 3 656 64 8 317 48 2365 2007 7 0 0 0 0 0 1 Coalesced: log2(us) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ----------------------------------------------------------- [ 6169.161360] 0 0 0 1 24 786 54 11 389 24 4192 771 6 4 0 0 1 0 1 [ 6174.161705] 0 0 0 1 18 800 35 6 380 27 3977 893 3 1 0 0 1 0 1 [ 6179.162741] 0 0 0 1 28 768 51 4 391 16 3998 831 5 3 0 0 0 0 2 [ 6184.162856] 0 0 0 0 19 770 60 2 376 26 3795 1004 9 5 1 0 1 0 1 [ 6189.163279] 0 0 0 0 28 761 49 7 372 27 3729 1056 3 2 0 0 1 0 1 [ 6194.164255] 0 0 0 0 25 785 49 7 394 19 3996 908 6 3 2 0 0 0 1 [ 6199.164658] 0 0 0 0 29 797 35 5 389 18 3995 898 3 4 1 1 1 0 1 The remaining high-order delays are a result of the synchronous fsyncs in systemd-journald, beyond the scope of this commit.
2015-12-11 07:42:22 +01:00
int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t);
void journal_reset_metrics(JournalMetrics *m);
void journal_default_metrics(JournalMetrics *m, int fd);
int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to);
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot, usec_t *from, usec_t *to);
bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec);
int journal_file_map_data_hash_table(JournalFile *f);
int journal_file_map_field_hash_table(JournalFile *f);
static inline bool JOURNAL_FILE_COMPRESS(JournalFile *f) {
assert(f);
return f->compress_xz || f->compress_lz4;
}