shared/utf8: add utf8_is_valid_n()

Sometimes we need to validate strings that are not NUL-terminated. Add a variant that takes an explicit length in bytes instead of stopping at NUL.
2020-09-01 10:43:21 +02:00 · 2020-09-01 10:43:21 +02:00 · 80ab31a435
parent e12b6e1951
commit 80ab31a435
3 changed files with 35 additions and 8 deletions
--- a/src/basic/utf8.c
+++ b/src/basic/utf8.c
@ -150,18 +150,22 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newlin
        return true;
 }

-char *utf8_is_valid(const char *str) {
-        const char *p;
+char *utf8_is_valid_n(const char *str, size_t len_bytes) {
+        /* Check if the string is composed of valid utf8 characters. If length len_bytes is given, stop after
+         * len_bytes. Otherwise, stop at NUL. */

        assert(str);

-        p = str;
-        while (*p) {
+        for (const char *p = str; len_bytes != (size_t) -1 ? (size_t) (p - str) < len_bytes : *p != '\0'; ) {
                int len;

-                len = utf8_encoded_valid_unichar(p, (size_t) -1);
-                if (len < 0)
-                        return NULL;
+                if (_unlikely_(*p == '\0') && len_bytes != (size_t) -1)
+                        return NULL; /* embedded NUL */
+
+                len = utf8_encoded_valid_unichar(p,
+                                                 len_bytes != (size_t) -1 ? len_bytes - (p - str) : (size_t) -1);
+                if (_unlikely_(len < 0))
+                        return NULL; /* invalid character */

                p += len;
        }
--- a/src/basic/utf8.h
+++ b/src/basic/utf8.h
@ -14,7 +14,10 @@

 bool unichar_is_valid(char32_t c);

-char *utf8_is_valid(const char *s) _pure_;
+char *utf8_is_valid_n(const char *str, size_t len_bytes) _pure_;
+static inline char *utf8_is_valid(const char *s) { /* NUL-terminated variant of utf8_is_valid_n() */
+        return utf8_is_valid_n(s, (size_t) -1); /* (size_t) -1 selects "stop at NUL" instead of a byte count */
+}
 char *ascii_is_valid(const char *s) _pure_;
 char *ascii_is_valid_n(const char *str, size_t len);

--- a/src/test/test-utf8.c
+++ b/src/test/test-utf8.c
@ -18,6 +18,25 @@ static void test_utf8_is_printable(void) {
        assert_se(utf8_is_printable("\t", 1));
 }

+static void test_utf8_n_is_valid(void) { /* Exercise utf8_is_valid_n() with explicit byte lengths */
+        log_info("/* %s */", __func__);
+
+        assert_se( utf8_is_valid_n("ascii is valid unicode", 21)); /* proper prefix of the 22-byte string */
+        assert_se( utf8_is_valid_n("ascii is valid unicode", 22)); /* all 22 bytes, terminator excluded */
+        assert_se(!utf8_is_valid_n("ascii is valid unicode", 23)); /* byte 23 is the terminating NUL, rejected as embedded NUL */
+        assert_se( utf8_is_valid_n("\342\204\242", 0)); /* zero bytes requested: expected valid */
+        assert_se(!utf8_is_valid_n("\342\204\242", 1)); /* 3-byte sequence cut off after 1 byte */
+        assert_se(!utf8_is_valid_n("\342\204\242", 2)); /* 3-byte sequence cut off after 2 bytes */
+        assert_se( utf8_is_valid_n("\342\204\242", 3)); /* complete 3-byte sequence */
+        assert_se(!utf8_is_valid_n("\342\204\242", 4)); /* byte 4 is the terminating NUL */
+        assert_se( utf8_is_valid_n("<ZZ>", 0)); /* lengths 0..4 cover only ASCII bytes of the literal */
+        assert_se( utf8_is_valid_n("<ZZ>", 1));
+        assert_se( utf8_is_valid_n("<ZZ>", 2));
+        assert_se( utf8_is_valid_n("<ZZ>", 3));
+        assert_se( utf8_is_valid_n("<ZZ>", 4));
+        assert_se(!utf8_is_valid_n("<ZZ>", 5)); /* byte 5 is the terminating NUL */
+}
+
 static void test_utf8_is_valid(void) {
        log_info("/* %s */", __func__);

@ -216,6 +235,7 @@ static void test_utf8_to_utf16(void) {
 }

 int main(int argc, char *argv[]) {
+        test_utf8_n_is_valid();
        test_utf8_is_valid();
        test_utf8_is_printable();
        test_ascii_is_valid();