fileio: make read_line() handle various line endings correctly

This adds support for Windows line endings.

More importantly, though, with this change a newline followed by EOF is
considered a single line ending.
This commit is contained in:
Lennart Poettering 2018-12-12 13:41:25 +01:00 committed by Zbigniew Jędrzejewski-Szmek
parent 57db447ebf
commit 838894b0c6
2 changed files with 81 additions and 8 deletions

View file

@ -694,18 +694,48 @@ int read_nul_string(FILE *f, char **ret) {
return 0;
}
/* Bit flags naming the end-of-line marker bytes we recognize. Each marker owns one bit, so several of
 * them can be OR-ed together into a single value. */
typedef enum EndOfLineMarker {
        EOL_NONE     = 0x0, /* not an end-of-line marker */
        EOL_ZERO     = 0x1, /* NUL byte, i.e. \0 */
        EOL_TEN      = 0x2, /* line feed, i.e. \n (NL/LF) */
        EOL_THIRTEEN = 0x4, /* carriage return, i.e. \r (CR) */
} EndOfLineMarker;
/* Translates a single byte into the EndOfLineMarker flag it stands for. Any byte other than \n, \r or \0
 * is not a line terminator and maps to EOL_NONE. */
static EndOfLineMarker categorize_eol(char c) {
        switch (c) {
        case '\n':
                return EOL_TEN;
        case '\r':
                return EOL_THIRTEEN;
        case '\0':
                return EOL_ZERO;
        default:
                return EOL_NONE;
        }
}
/* Presumably generates the funlockfilep() helper that read_line() hands to _cleanup_(), so that the
 * stream lock taken with flockfile() is dropped when the variable goes out of scope. NOTE(review): the
 * macro definition is not visible here — confirm. */
DEFINE_TRIVIAL_CLEANUP_FUNC(FILE*, funlockfile);
int read_line(FILE *f, size_t limit, char **ret) {
_cleanup_free_ char *buffer = NULL;
size_t n = 0, allocated = 0, count = 0;
_cleanup_free_ char *buffer = NULL;
assert(f);
/* Something like a bounded version of getline().
*
* Considers EOF, \n and \0 end of line delimiters, and does not include these delimiters in the string
* returned.
* Considers EOF, \n, \r and \0 end of line delimiters (or combinations of these), and does not include these
* delimiters in the string returned. Specifically, recognizes the following combinations of markers as line
* endings:
*
* \n (UNIX)
* \r (old MacOS)
* \0 (C strings)
* \n\0
* \r\0
* \r\n (Windows)
* \n\r
* \r\n\0
* \n\r\0
*
* Returns the number of bytes read from the file (i.e. including delimiters — this hence usually differs from
* the number of characters in the returned string). When EOF is hit, 0 is returned.
@ -722,9 +752,11 @@ int read_line(FILE *f, size_t limit, char **ret) {
{
_unused_ _cleanup_(funlockfilep) FILE *flocked = f;
EndOfLineMarker previous_eol = EOL_NONE;
flockfile(f);
for (;;) {
EndOfLineMarker eol;
int c;
if (n >= limit)
@ -737,13 +769,29 @@ int read_line(FILE *f, size_t limit, char **ret) {
if (ferror_unlocked(f) && n == 0)
return errno > 0 ? -errno : -EIO;
/* EOF is line ending too. */
break;
}
count++;
if (IN_SET(c, '\n', 0)) /* Reached a delimiter */
eol = categorize_eol(c);
if (FLAGS_SET(previous_eol, EOL_ZERO) ||
(eol == EOL_NONE && previous_eol != EOL_NONE) ||
(eol != EOL_NONE && (previous_eol & eol) != 0)) {
/* Previous char was a NUL? This is not an EOL, but the previous char was? This type of
* EOL marker has been seen right before? In any of these three cases we are
* done. But first, let's put this character back in the queue. */
assert_se(ungetc(c, f) != EOF);
count--;
break;
}
if (eol != EOL_NONE) {
previous_eol |= eol;
continue;
}
if (ret) {
if (!GREEDY_REALLOC(buffer, allocated, n + 2))

View file

@ -612,6 +612,13 @@ static void test_tempfn(void) {
static const char buffer[] =
"Some test data\n"
"Some weird line\r"
"terminators\r\n"
"and even more\n\r"
"now the same with a NUL\n\0"
"and more\r\0"
"and even more\r\n\0"
"and yet even more\n\r\0"
"With newlines, and a NUL byte\0"
"\n"
"an empty line\n"
@ -624,6 +631,27 @@ static void test_read_line_one_file(FILE *f) {
assert_se(read_line(f, (size_t) -1, &line) == 15 && streq(line, "Some test data"));
line = mfree(line);
assert_se(read_line(f, (size_t) -1, &line) == 16 && streq(line, "Some weird line"));
line = mfree(line);
assert_se(read_line(f, (size_t) -1, &line) == 13 && streq(line, "terminators"));
line = mfree(line);
assert_se(read_line(f, (size_t) -1, &line) == 15 && streq(line, "and even more"));
line = mfree(line);
assert_se(read_line(f, (size_t) -1, &line) == 25 && streq(line, "now the same with a NUL"));
line = mfree(line);
assert_se(read_line(f, (size_t) -1, &line) == 10 && streq(line, "and more"));
line = mfree(line);
assert_se(read_line(f, (size_t) -1, &line) == 16 && streq(line, "and even more"));
line = mfree(line);
assert_se(read_line(f, (size_t) -1, &line) == 20 && streq(line, "and yet even more"));
line = mfree(line);
assert_se(read_line(f, 1024, &line) == 30 && streq(line, "With newlines, and a NUL byte"));
line = mfree(line);
@ -640,10 +668,7 @@ static void test_read_line_one_file(FILE *f) {
/* read_line() stopped when it hit the limit, that means when we continue reading we'll read at the first
* character after the previous limit. Let's make use of that to continue our test. */
assert_se(read_line(f, 1024, &line) == 61 && streq(line, "line that is supposed to be truncated, because it is so long"));
line = mfree(line);
assert_se(read_line(f, 1024, &line) == 1 && streq(line, ""));
assert_se(read_line(f, 1024, &line) == 62 && streq(line, "line that is supposed to be truncated, because it is so long"));
line = mfree(line);
assert_se(read_line(f, 1024, &line) == 0 && streq(line, ""));