shared/utf8: add utf8_is_valid_n()

Sometimes we need to validate strings that are not NUL-terminated. Add a variant
that takes an explicit length in bytes instead of stopping at the terminating NUL.
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2020-09-01 10:43:21 +02:00
parent e12b6e1951
commit 80ab31a435
3 changed files with 35 additions and 8 deletions

View file

@ -150,18 +150,22 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newlin
return true;
}
char *utf8_is_valid(const char *str) {
const char *p;
char *utf8_is_valid_n(const char *str, size_t len_bytes) {
/* Check if the string is composed of valid utf8 characters. If length len_bytes is given, stop after
* len_bytes. Otherwise, stop at NUL. */
assert(str);
p = str;
while (*p) {
for (const char *p = str; len_bytes != (size_t) -1 ? (size_t) (p - str) < len_bytes : *p != '\0'; ) {
int len;
len = utf8_encoded_valid_unichar(p, (size_t) -1);
if (len < 0)
return NULL;
if (_unlikely_(*p == '\0') && len_bytes != (size_t) -1)
return NULL; /* embedded NUL */
len = utf8_encoded_valid_unichar(p,
len_bytes != (size_t) -1 ? len_bytes - (p - str) : (size_t) -1);
if (_unlikely_(len < 0))
return NULL; /* invalid character */
p += len;
}

View file

@ -14,7 +14,10 @@
bool unichar_is_valid(char32_t c);
char *utf8_is_valid(const char *s) _pure_;
char *utf8_is_valid_n(const char *str, size_t len_bytes) _pure_;
static inline char *utf8_is_valid(const char *s) {
        /* Validate a NUL-terminated string: (size_t) -1 is the "no length given" value, for which
         * utf8_is_valid_n() stops at the terminating NUL instead of after a fixed number of bytes. */
        const size_t no_length = (size_t) -1;

        return utf8_is_valid_n(s, no_length);
}
char *ascii_is_valid(const char *s) _pure_;
char *ascii_is_valid_n(const char *str, size_t len);

View file

@ -18,6 +18,25 @@ static void test_utf8_is_printable(void) {
assert_se(utf8_is_printable("\t", 1));
}
/* Exercise utf8_is_valid_n() with explicit byte lengths: prefixes of valid input, a multi-byte
 * sequence cut at every possible position, and lengths that reach past the end of the text so
 * that the string literal's terminating NUL falls inside the checked range. */
static void test_utf8_n_is_valid(void) {
log_info("/* %s */", __func__);
/* The text is 22 bytes of ASCII: 21 is a proper prefix, 22 is the whole text, and 23 also
 * covers the terminating NUL, which is expected to be rejected. */
assert_se( utf8_is_valid_n("ascii is valid unicode", 21));
assert_se( utf8_is_valid_n("ascii is valid unicode", 22));
assert_se(!utf8_is_valid_n("ascii is valid unicode", 23));
/* A single 3-byte sequence (E2 84 A2): zero bytes is trivially valid, 1 and 2 bytes cut the
 * sequence short, 3 bytes is the complete character, 4 bytes includes the terminating NUL. */
assert_se( utf8_is_valid_n("\342\204\242", 0));
assert_se(!utf8_is_valid_n("\342\204\242", 1));
assert_se(!utf8_is_valid_n("\342\204\242", 2));
assert_se( utf8_is_valid_n("\342\204\242", 3));
assert_se(!utf8_is_valid_n("\342\204\242", 4));
/* Four single-byte characters: every prefix from 0 to 4 bytes is valid; 5 bytes includes the
 * terminating NUL. */
assert_se( utf8_is_valid_n("<ZZ>", 0));
assert_se( utf8_is_valid_n("<ZZ>", 1));
assert_se( utf8_is_valid_n("<ZZ>", 2));
assert_se( utf8_is_valid_n("<ZZ>", 3));
assert_se( utf8_is_valid_n("<ZZ>", 4));
assert_se(!utf8_is_valid_n("<ZZ>", 5));
}
static void test_utf8_is_valid(void) {
log_info("/* %s */", __func__);
@ -216,6 +235,7 @@ static void test_utf8_to_utf16(void) {
}
int main(int argc, char *argv[]) {
test_utf8_n_is_valid();
test_utf8_is_valid();
test_utf8_is_printable();
test_ascii_is_valid();