Systemd/src/basic/extract-word.c
Zbigniew Jędrzejewski-Szmek 3565e09594 basic/escape: merge utf8 and non-utf8 paths in cunescape_one
Not every byte sequence is valid utf8. We allow escaping of non-utf8
sequences in strings by using octal and hexadecimal escape sequences
(\123 and \0xAB) for bytes at or above 128. Users of cunescape_one
could infer whether such use occured when they received an answer
between 128 and 256 in *ret (a non-ascii one byte character). But this
is subtle and misleading: the comments were wrong, because ascii is a
subset of unicode, so c != 0 did not mean non-unicode, but rather
ascii-subset-of-unicode-or-raw-byte. This was all rather confusing, so
make the "single byte" condition explicit.

I'm not convinced that allowing non-utf8 sequences to be produced is
useful in all cases where we allow it (e.g. in config files), but that
behaviour is unchanged, just made more explicit.

This also fixes an (invalid) gcc warning about unitialized variable
(*ret_unicode) in callers of cunescape_one.
2016-01-18 15:20:32 -05:00

301 lines
11 KiB
C

/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
This file is part of systemd.
Copyright 2010 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <errno.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include "alloc-util.h"
#include "escape.h"
#include "extract-word.h"
#include "log.h"
#include "macro.h"
#include "string-util.h"
#include "utf8.h"
int extract_first_word(const char **p, char **ret, const char *separators, ExtractFlags flags) {
_cleanup_free_ char *s = NULL;
size_t allocated = 0, sz = 0;
char c;
int r;
char quote = 0; /* 0 or ' or " */
bool backslash = false; /* whether we've just seen a backslash */
assert(p);
assert(ret);
/* Bail early if called after last value or with no input */
if (!*p)
goto finish_force_terminate;
c = **p;
if (!separators)
separators = WHITESPACE;
/* Parses the first word of a string, and returns it in
* *ret. Removes all quotes in the process. When parsing fails
* (because of an uneven number of quotes or similar), leaves
* the pointer *p at the first invalid character. */
if (flags & EXTRACT_DONT_COALESCE_SEPARATORS)
if (!GREEDY_REALLOC(s, allocated, sz+1))
return -ENOMEM;
for (;; (*p) ++, c = **p) {
if (c == 0)
goto finish_force_terminate;
else if (strchr(separators, c)) {
if (flags & EXTRACT_DONT_COALESCE_SEPARATORS) {
(*p) ++;
goto finish_force_next;
}
} else {
/* We found a non-blank character, so we will always
* want to return a string (even if it is empty),
* allocate it here. */
if (!GREEDY_REALLOC(s, allocated, sz+1))
return -ENOMEM;
break;
}
}
for (;; (*p) ++, c = **p) {
if (backslash) {
if (!GREEDY_REALLOC(s, allocated, sz+7))
return -ENOMEM;
if (c == 0) {
if ((flags & EXTRACT_CUNESCAPE_RELAX) &&
(!quote || flags & EXTRACT_RELAX)) {
/* If we find an unquoted trailing backslash and we're in
* EXTRACT_CUNESCAPE_RELAX mode, keep it verbatim in the
* output.
*
* Unbalanced quotes will only be allowed in EXTRACT_RELAX
* mode, EXTRACT_CUNESCAPE_RELAX mode does not allow them.
*/
s[sz++] = '\\';
goto finish_force_terminate;
}
if (flags & EXTRACT_RELAX)
goto finish_force_terminate;
return -EINVAL;
}
if (flags & EXTRACT_CUNESCAPE) {
uint32_t u;
bool eight_bit = false;
r = cunescape_one(*p, (size_t) -1, &u, &eight_bit);
if (r < 0) {
if (flags & EXTRACT_CUNESCAPE_RELAX) {
s[sz++] = '\\';
s[sz++] = c;
} else
return -EINVAL;
} else {
(*p) += r - 1;
if (eight_bit)
s[sz++] = u;
else
sz += utf8_encode_unichar(s + sz, u);
}
} else
s[sz++] = c;
backslash = false;
} else if (quote) { /* inside either single or double quotes */
for (;; (*p) ++, c = **p) {
if (c == 0) {
if (flags & EXTRACT_RELAX)
goto finish_force_terminate;
return -EINVAL;
} else if (c == quote) { /* found the end quote */
quote = 0;
break;
} else if (c == '\\' && !(flags & EXTRACT_RETAIN_ESCAPE)) {
backslash = true;
break;
} else {
if (!GREEDY_REALLOC(s, allocated, sz+2))
return -ENOMEM;
s[sz++] = c;
}
}
} else {
for (;; (*p) ++, c = **p) {
if (c == 0)
goto finish_force_terminate;
else if ((c == '\'' || c == '"') && (flags & EXTRACT_QUOTES)) {
quote = c;
break;
} else if (c == '\\' && !(flags & EXTRACT_RETAIN_ESCAPE)) {
backslash = true;
break;
} else if (strchr(separators, c)) {
if (flags & EXTRACT_DONT_COALESCE_SEPARATORS) {
(*p) ++;
goto finish_force_next;
}
/* Skip additional coalesced separators. */
for (;; (*p) ++, c = **p) {
if (c == 0)
goto finish_force_terminate;
if (!strchr(separators, c))
break;
}
goto finish;
} else {
if (!GREEDY_REALLOC(s, allocated, sz+2))
return -ENOMEM;
s[sz++] = c;
}
}
}
}
finish_force_terminate:
*p = NULL;
finish:
if (!s) {
*p = NULL;
*ret = NULL;
return 0;
}
finish_force_next:
s[sz] = 0;
*ret = s;
s = NULL;
return 1;
}
int extract_first_word_and_warn(
const char **p,
char **ret,
const char *separators,
ExtractFlags flags,
const char *unit,
const char *filename,
unsigned line,
const char *rvalue) {
/* Try to unquote it, if it fails, warn about it and try again
* but this time using EXTRACT_CUNESCAPE_RELAX to keep the
* backslashes verbatim in invalid escape sequences. */
const char *save;
int r;
save = *p;
r = extract_first_word(p, ret, separators, flags);
if (r >= 0)
return r;
if (r == -EINVAL && !(flags & EXTRACT_CUNESCAPE_RELAX)) {
/* Retry it with EXTRACT_CUNESCAPE_RELAX. */
*p = save;
r = extract_first_word(p, ret, separators, flags|EXTRACT_CUNESCAPE_RELAX);
if (r >= 0) {
/* It worked this time, hence it must have been an invalid escape sequence we could correct. */
log_syntax(unit, LOG_WARNING, filename, line, EINVAL, "Invalid escape sequences in line, correcting: \"%s\"", rvalue);
return r;
}
/* If it's still EINVAL; then it must be unbalanced quoting, report this. */
if (r == -EINVAL)
return log_syntax(unit, LOG_ERR, filename, line, r, "Unbalanced quoting, ignoring: \"%s\"", rvalue);
}
/* Can be any error, report it */
return log_syntax(unit, LOG_ERR, filename, line, r, "Unable to decode word \"%s\", ignoring: %m", rvalue);
}
int extract_many_words(const char **p, const char *separators, ExtractFlags flags, ...) {
va_list ap;
char **l;
int n = 0, i, c, r;
/* Parses a number of words from a string, stripping any
* quotes if necessary. */
assert(p);
/* Count how many words are expected */
va_start(ap, flags);
for (;;) {
if (!va_arg(ap, char **))
break;
n++;
}
va_end(ap);
if (n <= 0)
return 0;
/* Read all words into a temporary array */
l = newa0(char*, n);
for (c = 0; c < n; c++) {
r = extract_first_word(p, &l[c], separators, flags);
if (r < 0) {
int j;
for (j = 0; j < c; j++)
free(l[j]);
return r;
}
if (r == 0)
break;
}
/* If we managed to parse all words, return them in the passed
* in parameters */
va_start(ap, flags);
for (i = 0; i < n; i++) {
char **v;
v = va_arg(ap, char **);
assert(v);
*v = l[i];
}
va_end(ap);
return c;
}