From 52a8f9295b828872586c5b9e5587064a25dae9b2 Mon Sep 17 00:00:00 2001 From: Nikola Knezevic Date: Tue, 7 Jan 2020 00:06:49 +0100 Subject: [PATCH] Add support for \u escape in fromJSON As fromTOML supports \u and \U escapes, bring fromJSON on par. As JSON defaults to UTF-8 encoding (every JSON parser must support UTF-8), this change parses the `\u hex hex hex hex` sequence (\u followed by 4 hexadecimal digits) into an UTF-8 representation. Add a test to verify correct parsing, using all escape sequences from json.org. --- src/libexpr/json-to-value.cc | 88 ++++++++++++++++++++++- tests/lang/eval-okay-fromjson-escapes.exp | 1 + tests/lang/eval-okay-fromjson-escapes.nix | 3 + 3 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 tests/lang/eval-okay-fromjson-escapes.exp create mode 100644 tests/lang/eval-okay-fromjson-escapes.nix diff --git a/src/libexpr/json-to-value.cc b/src/libexpr/json-to-value.cc index 96cd0fc72..47cab2bb5 100644 --- a/src/libexpr/json-to-value.cc +++ b/src/libexpr/json-to-value.cc @@ -11,6 +11,87 @@ static void skipWhitespace(const char * & s) } +/* + Parse an unicode escape sequence (4 hex characters following \u) in JSON string +*/ +static string parseUnicodeEscapeSequence(const char * & s) +{ + int codepoint = 0; + + const auto factors = { 12u, 8u, 4u, 0u }; + for (const auto factor : factors) + { + if (!*s) throw JSONParseError("got end-of-string in JSON string while parsing \\u sequence"); + + if (*s >= '0' and *s <= '9') { + codepoint += static_cast((static_cast(*s) - 0x30u) << factor); + } else if (*s >= 'A' and *s <= 'F') { + codepoint += static_cast((static_cast(*s) - 0x37u) << factor); + } else if (*s >= 'a' and *s <= 'f') { + codepoint += static_cast((static_cast(*s) - 0x57u) << factor); + } else { + throw JSONParseError(format("illegal character '%1%' in \\u escape sequence.") % *s); + } + s++; + } + + if ((codepoint > 0xd7ff && codepoint < 0xe000) || codepoint > 0x10ffff) { + throw JSONParseError("Unicode escape sequence is not a Unicode scalar value"); + } + + // taken from cpptoml.h + std::string result; + // See Table 3-6 of the Unicode standard + if (codepoint <= 0x7f) + { + // 1-byte codepoints: 00000000 0xxxxxxx + // repr: 0xxxxxxx + result += static_cast(codepoint & 0x7f); + } + else if (codepoint <= 0x7ff) + { + // 2-byte codepoints: 00000yyy yyxxxxxx + // repr: 110yyyyy 10xxxxxx + // + // 0x1f = 00011111 + // 0xc0 = 11000000 + // + result += static_cast(0xc0 | ((codepoint >> 6) & 0x1f)); + // + // 0x80 = 10000000 + // 0x3f = 00111111 + // + result += static_cast(0x80 | (codepoint & 0x3f)); + } + else if (codepoint <= 0xffff) + { + // 3-byte codepoints: zzzzyyyy yyxxxxxx + // repr: 1110zzzz 10yyyyyy 10xxxxxx + // + // 0xe0 = 11100000 + // 0x0f = 00001111 + // + result += static_cast(0xe0 | ((codepoint >> 12) & 0x0f)); + result += static_cast(0x80 | ((codepoint >> 6) & 0x1f)); + result += static_cast(0x80 | (codepoint & 0x3f)); + } + else + { + // 4-byte codepoints: 000uuuuu zzzzyyyy yyxxxxxx + // repr: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx + // + // 0xf0 = 11110000 + // 0x07 = 00000111 + // + result += static_cast(0xf0 | ((codepoint >> 18) & 0x07)); + result += static_cast(0x80 | ((codepoint >> 12) & 0x3f)); + result += static_cast(0x80 | ((codepoint >> 6) & 0x3f)); + result += static_cast(0x80 | (codepoint & 0x3f)); + } + return result; +} + + static string parseJSONString(const char * & s) { string res; @@ -27,8 +108,11 @@ static string parseJSONString(const char * & s) else if (*s == 'n') res += '\n'; else if (*s == 'r') res += '\r'; else if (*s == 't') res += '\t'; - else if (*s == 'u') throw JSONParseError("\\u characters in JSON strings are currently not supported"); - else throw JSONParseError("invalid escaped character in JSON string"); + else if (*s == 'u') { + res += parseUnicodeEscapeSequence(++s); + // to neuter the outside s++ + s--; + } else throw JSONParseError("invalid escaped character in JSON string"); s++; } else res += *s++; diff --git a/tests/lang/eval-okay-fromjson-escapes.exp b/tests/lang/eval-okay-fromjson-escapes.exp new file mode 100644 index 000000000..add5505a8 --- /dev/null +++ b/tests/lang/eval-okay-fromjson-escapes.exp @@ -0,0 +1 @@ +"quote \" reverse solidus \\ solidus / backspace  formfeed newline \n carriage return \r horizontal tab \t 1 char unicode encoded backspace  1 char unicode encoded e with accent é 2 char unicode encoded s with caron š 3 char unicode encoded rightwards arrow →" diff --git a/tests/lang/eval-okay-fromjson-escapes.nix b/tests/lang/eval-okay-fromjson-escapes.nix new file mode 100644 index 000000000..f00713507 --- /dev/null +++ b/tests/lang/eval-okay-fromjson-escapes.nix @@ -0,0 +1,3 @@ +# This string contains all supported escapes in a JSON string, per json.org +# \b and \f are not supported by Nix +builtins.fromJSON ''"quote \" reverse solidus \\ solidus \/ backspace \b formfeed \f newline \n carriage return \r horizontal tab \t 1 char unicode encoded backspace \u0008 1 char unicode encoded e with accent \u00e9 2 char unicode encoded s with caron \u0161 3 char unicode encoded rightwards arrow \u2192"''