From e71fb4b3020dd5881d8a55d169deb02b4c5f9638 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 18 Jul 2018 12:21:39 +0200 Subject: [PATCH 1/5] utf8: update utf8_is_valid() a bit Let's avoid a few casts in the function. Also, let's drop the "const" when returning the string, for similar reasons as strchr() and friends drop it: so that we don't add a const if the user passes in a non-const string. --- src/basic/utf8.c | 11 ++++++----- src/basic/utf8.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/basic/utf8.c b/src/basic/utf8.c index a5ce1a2944..40d4fbda62 100644 --- a/src/basic/utf8.c +++ b/src/basic/utf8.c @@ -154,22 +154,23 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool newline) { return true; } -const char *utf8_is_valid(const char *str) { - const uint8_t *p; +char *utf8_is_valid(const char *str) { + const char *p; assert(str); - for (p = (const uint8_t*) str; *p; ) { + p = str; + while (*p) { int len; - len = utf8_encoded_valid_unichar((const char *)p); + len = utf8_encoded_valid_unichar(p); if (len < 0) return NULL; p += len; } - return str; + return (char*) str; } char *utf8_escape_invalid(const char *str) { diff --git a/src/basic/utf8.h b/src/basic/utf8.h index e8af7a576b..f5e9f8cacb 100644 --- a/src/basic/utf8.h +++ b/src/basic/utf8.h @@ -14,7 +14,7 @@ bool unichar_is_valid(char32_t c); -const char *utf8_is_valid(const char *s) _pure_; +char *utf8_is_valid(const char *s) _pure_; char *ascii_is_valid(const char *s) _pure_; char *ascii_is_valid_n(const char *str, size_t len); From 07667be733e1976bb7864bf54ad7c03e9464ee87 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 18 Jul 2018 12:23:31 +0200 Subject: [PATCH 2/5] utf8: modernize utf16 inline calls a bit Let's fix an indentation issue. Let's avoid yoda comparisons. Let's drop unnecessary (). Let's make sure we convert 16bit values to 32bit before shifting them by 10bit to the left, to avoid overflows. 
Let's avoid comparisons between signed literals and unsigned variables, in particular if the literals are outside of the minimum range C requires for "int". --- src/basic/utf8.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/basic/utf8.h b/src/basic/utf8.h index f5e9f8cacb..63991bf8c8 100644 --- a/src/basic/utf8.h +++ b/src/basic/utf8.h @@ -31,15 +31,15 @@ int utf8_encoded_valid_unichar(const char *str); int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar); static inline bool utf16_is_surrogate(char16_t c) { - return (0xd800 <= c && c <= 0xdfff); + return c >= 0xd800U && c <= 0xdfffU; } static inline bool utf16_is_trailing_surrogate(char16_t c) { - return (0xdc00 <= c && c <= 0xdfff); + return c >= 0xdc00U && c <= 0xdfffU; } static inline char32_t utf16_surrogate_pair_to_unichar(char16_t lead, char16_t trail) { - return ((lead - 0xd800) << 10) + (trail - 0xdc00) + 0x10000; + return ((((char32_t) lead - 0xd800U) << 10) + ((char32_t) trail - 0xdc00U) + 0x10000U); } size_t utf8_n_codepoints(const char *str); From 7c4218578466e128a9cff42ff95d58deab16637e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 18 Jul 2018 12:30:00 +0200 Subject: [PATCH 3/5] utf8: change return type of utf8_encoded_expected_len() to size_t After all it returns the length of a string in chars, and hence should return size_t, exactly like strlen(). 
--- src/basic/utf8.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/basic/utf8.c b/src/basic/utf8.c index 40d4fbda62..f6ab26d826 100644 --- a/src/basic/utf8.c +++ b/src/basic/utf8.c @@ -61,12 +61,12 @@ static bool unichar_is_control(char32_t ch) { } /* count of characters used to encode one unicode char */ -static int utf8_encoded_expected_len(const char *str) { - unsigned char c; +static size_t utf8_encoded_expected_len(const char *str) { + uint8_t c; assert(str); - c = (unsigned char) str[0]; + c = (uint8_t) str[0]; if (c < 0x80) return 1; if ((c & 0xe0) == 0xc0) @@ -86,7 +86,7 @@ static int utf8_encoded_expected_len(const char *str) { /* decode one unicode char */ int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar) { char32_t unichar; - int len, i; + size_t len, i; assert(str); @@ -118,6 +118,7 @@ int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar) { for (i = 1; i < len; i++) { if (((char32_t)str[i] & 0xc0) != 0x80) return -EINVAL; + unichar <<= 6; unichar |= (char32_t)str[i] & 0x3f; } @@ -377,8 +378,9 @@ static int utf8_unichar_to_encoded_len(char32_t unichar) { /* validate one encoded unicode char and return its length */ int utf8_encoded_valid_unichar(const char *str) { - int len, i, r; char32_t unichar; + size_t len, i; + int r; assert(str); @@ -400,14 +402,14 @@ int utf8_encoded_valid_unichar(const char *str) { return r; /* check if encoded length matches encoded value */ - if (utf8_unichar_to_encoded_len(unichar) != len) + if (utf8_unichar_to_encoded_len(unichar) != (int) len) return -EINVAL; /* check if value has valid range */ if (!unichar_is_valid(unichar)) return -EINVAL; - return len; + return (int) len; } size_t utf8_n_codepoints(const char *str) { From 2ac2ff3fc1af850bf5159a652cf0005f634aebf9 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 25 Jun 2018 19:16:43 +0200 Subject: [PATCH 4/5] utf8: let's update utf16_to_utf8() a bit Let's change utf16_to_utf8() prototype 
to refer to utf16 chars with char16_t rather than void Let's not cast away a "const" needlessly. Let's add a few comments. Let's fix the calculations of the buffer size to allocate, and how long to run the loop in case of uneven byte numbers --- src/basic/utf8.c | 23 +++++++++++++++-------- src/basic/utf8.h | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/basic/utf8.c b/src/basic/utf8.c index f6ab26d826..7dc84c1157 100644 --- a/src/basic/utf8.c +++ b/src/basic/utf8.c @@ -314,18 +314,25 @@ size_t utf8_encode_unichar(char *out_utf8, char32_t g) { return 0; } -char *utf16_to_utf8(const void *s, size_t length) { +char *utf16_to_utf8(const char16_t *s, size_t length /* bytes! */) { const uint8_t *f; char *r, *t; - r = new(char, (length * 4 + 1) / 2 + 1); + assert(s); + + /* Input length is in bytes, i.e. the shortest possible character takes 2 bytes. Each unicode character may + * take up to 4 bytes in UTF-8. Let's also account for a trailing NUL byte. */ + if (length * 2 < length) + return NULL; /* overflow */ + + r = new(char, length * 2 + 1); if (!r) return NULL; - f = s; + f = (const uint8_t*) s; t = r; - while (f < (const uint8_t*) s + length) { + while (f + 1 < (const uint8_t*) s + length) { char16_t w1, w2; /* see RFC 2781 section 2.2 */ @@ -335,13 +342,13 @@ char *utf16_to_utf8(const void *s, size_t length) { if (!utf16_is_surrogate(w1)) { t += utf8_encode_unichar(t, w1); - continue; } if (utf16_is_trailing_surrogate(w1)) - continue; - else if (f >= (const uint8_t*) s + length) + continue; /* spurious trailing surrogate, ignore */ + + if (f + 1 >= (const uint8_t*) s + length) break; w2 = f[1] << 8 | f[0]; @@ -349,7 +356,7 @@ char *utf16_to_utf8(const void *s, size_t length) { if (!utf16_is_trailing_surrogate(w2)) { f -= 2; - continue; + continue; /* surrogate missing its trailing surrogate, ignore */ } t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2)); diff --git a/src/basic/utf8.h b/src/basic/utf8.h index 
63991bf8c8..13c48b0978 100644 --- a/src/basic/utf8.h +++ b/src/basic/utf8.h @@ -25,7 +25,7 @@ char *utf8_escape_invalid(const char *s); char *utf8_escape_non_printable(const char *str); size_t utf8_encode_unichar(char *out_utf8, char32_t g); -char *utf16_to_utf8(const void *s, size_t length); +char *utf16_to_utf8(const char16_t *s, size_t length /* bytes! */); int utf8_encoded_valid_unichar(const char *str); int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar); From 80b0a5972999caacab538f2f46fa8287a30caacd Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 25 Jun 2018 19:17:42 +0200 Subject: [PATCH 5/5] utf8: add utf8_to_utf16() helper --- src/basic/utf8.c | 73 ++++++++++++++++++++++++++++++++++++++++++++ src/basic/utf8.h | 5 +++ src/test/test-utf8.c | 44 +++++++++++++++++++++++--- 3 files changed, 117 insertions(+), 5 deletions(-) diff --git a/src/basic/utf8.c b/src/basic/utf8.c index 7dc84c1157..e0d1949dc7 100644 --- a/src/basic/utf8.c +++ b/src/basic/utf8.c @@ -366,6 +366,79 @@ char *utf16_to_utf8(const char16_t *s, size_t length /* bytes! */) { return r; } +size_t utf16_encode_unichar(char16_t *out, char32_t c) { + + /* Note that this encodes as little-endian. */ + + switch (c) { + + case 0 ... 0xd7ffU: + case 0xe000U ... 0xffffU: + out[0] = htole16(c); + return 1; + + case 0x10000U ... 
0x10ffffU: + c -= 0x10000U; + out[0] = htole16((c >> 10) + 0xd800U); + out[1] = htole16((c & 0x3ffU) + 0xdc00U); + return 2; + + default: /* A surrogate (invalid) */ + return 0; + } +} + +char16_t *utf8_to_utf16(const char *s, size_t length) { + char16_t *n, *p; + size_t i; + int r; + + assert(s); + + n = new(char16_t, length + 1); + if (!n) + return NULL; + + p = n; + + for (i = 0; i < length;) { + char32_t unichar; + size_t e; + + e = utf8_encoded_expected_len(s + i); + if (e <= 1) /* Invalid and single byte characters are copied as they are */ + goto copy; + + if (i + e > length) /* sequence longer than input buffer, then copy as-is */ + goto copy; + + r = utf8_encoded_to_unichar(s + i, &unichar); + if (r < 0) /* sequence invalid, then copy as-is */ + goto copy; + + p += utf16_encode_unichar(p, unichar); + i += e; + continue; + + copy: + *(p++) = htole16(s[i++]); + } + + *p = 0; + return n; +} + +size_t char16_strlen(const char16_t *s) { + size_t n = 0; + + assert(s); + + while (*s != 0) + n++, s++; + + return n; +} + /* expected size used to encode one unicode char */ static int utf8_unichar_to_encoded_len(char32_t unichar) { diff --git a/src/basic/utf8.h b/src/basic/utf8.h index 13c48b0978..69a816e125 100644 --- a/src/basic/utf8.h +++ b/src/basic/utf8.h @@ -25,7 +25,12 @@ char *utf8_escape_invalid(const char *s); char *utf8_escape_non_printable(const char *str); size_t utf8_encode_unichar(char *out_utf8, char32_t g); +size_t utf16_encode_unichar(char16_t *out, char32_t c); + char *utf16_to_utf8(const char16_t *s, size_t length /* bytes! 
*/); +char16_t *utf8_to_utf16(const char *s, size_t length); + +size_t char16_strlen(const char16_t *s); int utf8_encoded_valid_unichar(const char *str); int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar); diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c index d35daf53dc..9849530ac8 100644 --- a/src/test/test-utf8.c +++ b/src/test/test-utf8.c @@ -2,6 +2,7 @@ #include "alloc-util.h" #include "string-util.h" +#include "strv.h" #include "utf8.h" #include "util.h" @@ -87,15 +88,25 @@ static void test_utf8_escaping_printable(void) { } static void test_utf16_to_utf8(void) { - char *a = NULL; - const uint16_t utf16[] = { htole16('a'), htole16(0xd800), htole16('b'), htole16(0xdc00), htole16('c'), htole16(0xd801), htole16(0xdc37) }; - const char utf8[] = { 'a', 'b', 'c', 0xf0, 0x90, 0x90, 0xb7, 0 }; + const char16_t utf16[] = { htole16('a'), htole16(0xd800), htole16('b'), htole16(0xdc00), htole16('c'), htole16(0xd801), htole16(0xdc37) }; + static const char utf8[] = { 'a', 'b', 'c', 0xf0, 0x90, 0x90, 0xb7 }; + _cleanup_free_ char16_t *b = NULL; + _cleanup_free_ char *a = NULL; - a = utf16_to_utf8(utf16, 14); + /* Convert UTF-16 to UTF-8, filtering embedded bad chars */ + a = utf16_to_utf8(utf16, sizeof(utf16)); assert_se(a); - assert_se(streq(a, utf8)); + assert_se(memcmp(a, utf8, sizeof(utf8)) == 0); + + /* Convert UTF-8 to UTF-16, and back */ + b = utf8_to_utf16(utf8, sizeof(utf8)); + assert_se(b); free(a); + a = utf16_to_utf8(b, char16_strlen(b) * 2); + assert_se(a); + assert_se(strlen(a) == sizeof(utf8)); + assert_se(memcmp(a, utf8, sizeof(utf8)) == 0); } static void test_utf8_n_codepoints(void) { @@ -116,6 +127,28 @@ static void test_utf8_console_width(void) { assert_se(utf8_console_width("\xF1") == (size_t) -1); } +static void test_utf8_to_utf16(void) { + const char *p; + + FOREACH_STRING(p, + "abc", + "zażółcić gęślą jaźń", + "串", + "", + "…👊🔪💐…") { + + _cleanup_free_ char16_t *a = NULL; + _cleanup_free_ char *b = NULL; + + a = 
utf8_to_utf16(p, strlen(p)); + assert_se(a); + + b = utf16_to_utf8(a, char16_strlen(a) * 2); + assert_se(b); + assert_se(streq(p, b)); + } +} + int main(int argc, char *argv[]) { test_utf8_is_valid(); test_utf8_is_printable(); @@ -127,6 +160,7 @@ int main(int argc, char *argv[]) { test_utf16_to_utf8(); test_utf8_n_codepoints(); test_utf8_console_width(); + test_utf8_to_utf16(); return 0; }