terminal: add parser state-machine

The term-parser is used to parse any input from TTY-clients. It reads CSI, DCS, OSC and ST control sequences and normal escape sequences. It doesn't do anything with the parsed data besides detecting the sequence and returning it. The caller has to react to them. The parser also comes with its own UTF-8 helpers. The reason for that is that we don't want to assert() or hard-fail on parsing errors. Instead, we treat any invalid UTF-8 sequences as ISO-8859-1. This allows pasting invalid data into a terminal (which cannot be controlled through the TTY, anyway) and we still deal with it in a proper manner. This is _required_ for 8-bit and 7-bit DEC modes (including the g0-g3 mappings), so it's not just an ugly fallback because we can (it's still horribly ugly but at least we have an excuse).
2014-06-15 14:50:00 +02:00 · 2014-06-15 14:50:00 +02:00 · 1c9633d669
parent 28622e8f5b
commit 1c9633d669
6 changed files with 2622 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -220,6 +220,7 @@
 /test-strxcpyx
 /test-tables
 /test-term-page
+/test-term-parser
 /test-time
 /test-tmpfiles
 /test-udev
--- a/Makefile.am
+++ b/Makefile.am
@ -2839,7 +2839,9 @@ libsystemd_terminal_la_CFLAGS = \

 libsystemd_terminal_la_SOURCES = \
 	src/libsystemd-terminal/term-internal.h \
+	src/libsystemd-terminal/term-charset.c \
 	src/libsystemd-terminal/term-page.c \
+	src/libsystemd-terminal/term-parser.c \
 	src/libsystemd-terminal/term-wcwidth.c

 libsystemd_terminal_la_LIBADD = \
@ -2854,8 +2856,17 @@ test_term_page_LDADD = \
 	libsystemd-internal.la \
 	libsystemd-shared.la

+test_term_parser_SOURCES = \
+	src/libsystemd-terminal/test-term-parser.c
+
+test_term_parser_LDADD = \
+	libsystemd-terminal.la \
+	libsystemd-internal.la \
+	libsystemd-shared.la
+
 tests += \
-	test-term-page
+	test-term-page \
+	test-term-parser

 # ------------------------------------------------------------------------------
 if ENABLE_GTK_DOC
--- a/src/libsystemd-terminal/term-charset.c
+++ b/src/libsystemd-terminal/term-charset.c
@ -0,0 +1,491 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+  This file is part of systemd.
+
+  Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+/*
+ * VTE Character Sets
+ * These are predefined character sets that can be loaded into GL and GR. By
+ * default we use unicode_lower and unicode_upper, that is, both sets have the
+ * exact unicode mapping. unicode_lower is effectively ASCII and unicode_upper
+ * as defined by the unicode standard (I guess, ISO 8859-1).
+ * Several other character sets are defined here. However, all of them are
+ * limited to the 96 character space of GL or GR. Everything beyond GR (which
+ * was not supported by the classic VTs by DEC but is available in VT emulators
+ * that support unicode/UTF8) is always mapped to unicode and cannot be changed
+ * by these character sets. Even mapping GL and GR is only available for
+ * backwards compatibility as new applications can use the Unicode functionality
+ * of the VTE.
+ *
+ * Moreover, mapping GR is almost unnecessary to support. In fact, Unicode UTF-8
+ * support in VTE works by reading every incoming data as UTF-8 stream. This
+ * maps GL/ASCII to ASCII, as UTF-8 is backwards compatible to ASCII, however,
+ * everything that has the 8th bit set is a >=2-byte character in UTF-8. That is,
+ * this is in no way backwards compatible to >=VT220 8bit support. Therefore, if
+ * someone maps a character set into GR and wants to use them with this VTE,
+ * then they must already send UTF-8 characters to use GR (all GR characters are
+ * 8-bits). Hence, they can easily also send the correct UTF-8 character for the
+ * unicode mapping.
+ * The only advantage is that most characters in many sets are 3-byte UTF-8
+ * characters and by mapping the set into GR/GL you can use 2 or 1 byte UTF-8
+ * characters which saves bandwidth.
+ * Another reason is, if you have older applications that use the VT220 8-bit
+ * support and you put an ASCII/8bit-extension to UTF-8 converter in between, you
+ * need these mappings to have the application behave correctly if it uses GL/GR
+ * mappings extensively.
+ *
+ * Anyway, we support GL/GR mappings so here are the most commonly used maps as
+ * defined by Unicode-standard, DEC-private maps and other famous charmaps.
+ *
+ * Characters 1-32 are always the control characters (part of CL) and cannot be
+ * mapped. Characters 34-127 (94 characters) are part of GL and can be mapped.
+ * Characters 33 and 128 are not part of GL and always mapped by the VTE.
+ * However, for GR they can be mapped differently (96 chars) so we have to
+ * include them. The mapper has to take care not to use them in GL.
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include "term-internal.h"
+
+/*
+ * Lower Unicode character set. This maps the characters to the basic ASCII
+ * characters 33-126. These are all graphics characters defined in ASCII.
+ */
+term_charset term_unicode_lower = {
+        /*
+         * Identity table: slot N holds code point 32 + N. Each row is anchored
+         * with a designator and then continues positionally for 8 entries.
+         */
+        [0]  =  32,  33,  34,  35,  36,  37,  38,  39,
+        [8]  =  40,  41,  42,  43,  44,  45,  46,  47,
+        [16] =  48,  49,  50,  51,  52,  53,  54,  55,
+        [24] =  56,  57,  58,  59,  60,  61,  62,  63,
+        [32] =  64,  65,  66,  67,  68,  69,  70,  71,
+        [40] =  72,  73,  74,  75,  76,  77,  78,  79,
+        [48] =  80,  81,  82,  83,  84,  85,  86,  87,
+        [56] =  88,  89,  90,  91,  92,  93,  94,  95,
+        [64] =  96,  97,  98,  99, 100, 101, 102, 103,
+        [72] = 104, 105, 106, 107, 108, 109, 110, 111,
+        [80] = 112, 113, 114, 115, 116, 117, 118, 119,
+        [88] = 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+/*
+ * Upper Unicode Table
+ * This maps all characters to the upper unicode characters 161-254. These are
+ * not compatible to any older 8 bit character sets. See the Unicode standard
+ * for the definitions of each symbol.
+ */
+term_charset term_unicode_upper = {
+        /*
+         * Identity table: slot N holds code point 160 + N. Each row is anchored
+         * with a designator and then continues positionally for 8 entries.
+         */
+        [0]  = 160, 161, 162, 163, 164, 165, 166, 167,
+        [8]  = 168, 169, 170, 171, 172, 173, 174, 175,
+        [16] = 176, 177, 178, 179, 180, 181, 182, 183,
+        [24] = 184, 185, 186, 187, 188, 189, 190, 191,
+        [32] = 192, 193, 194, 195, 196, 197, 198, 199,
+        [40] = 200, 201, 202, 203, 204, 205, 206, 207,
+        [48] = 208, 209, 210, 211, 212, 213, 214, 215,
+        [56] = 216, 217, 218, 219, 220, 221, 222, 223,
+        [64] = 224, 225, 226, 227, 228, 229, 230, 231,
+        [72] = 232, 233, 234, 235, 236, 237, 238, 239,
+        [80] = 240, 241, 242, 243, 244, 245, 246, 247,
+        [88] = 248, 249, 250, 251, 252, 253, 254, 255,
+};
+
+/*
+ * The DEC supplemental graphics set. For its definition see here:
+ *  http://vt100.net/docs/vt220-rm/table2-3b.html
+ * It's basically a mixture of common European symbols that are not part of
+ * ASCII. Most often, this is mapped into GR to extend the basic ASCII part.
+ *
+ * This is very similar to unicode_upper, however, few symbols differ so do not
+ * mix them up!
+ */
+term_charset term_dec_supplemental_graphics = {
+        /*
+         * Each row is anchored with a designator and then continues
+         * positionally for 8 entries. The first and last slot are undefined
+         * and hold -1 (stored in an unsigned 32-bit element).
+         * NOTE(review): the 0 entries presumably mark positions without a
+         * mapping -- confirm against the users of this table.
+         */
+        [0]  =  -1, 161, 162, 163,   0, 165,   0, 167,  /* [0] undefined */
+        [8]  = 164, 169, 170, 171,   0,   0,   0,   0,
+        [16] = 176, 177, 178, 179,   0, 181, 182, 183,
+        [24] =   0, 185, 186, 187, 188, 189,   0, 191,
+        [32] = 192, 193, 194, 195, 196, 197, 198, 199,
+        [40] = 200, 201, 202, 203, 204, 205, 206, 207,
+        [48] =   0, 209, 210, 211, 212, 213, 214, 338,
+        [56] = 216, 217, 218, 219, 220, 376,   0, 223,
+        [64] = 224, 225, 226, 227, 228, 229, 230, 231,
+        [72] = 232, 233, 234, 235, 236, 237, 238, 239,
+        [80] =   0, 241, 242, 243, 244, 245, 246, 339,
+        [88] = 248, 249, 250, 251, 252, 255,   0,  -1,  /* [95] undefined */
+};
+
+/*
+ * DEC special graphics character set. See here for its definition:
+ *  http://vt100.net/docs/vt220-rm/table2-4.html
+ * This contains several characters to create ASCII drawings and similar. It's
+ * commonly mapped into GR to extend the basic ASCII characters.
+ *
+ * Lower 62 characters map to ASCII 33-94, everything beyond is special and
+ * commonly used for ASCII drawings. It depends on the Unicode Standard 3.2 for
+ * the extended horizontal scan-line characters 3, 5, 7, and 9.
+ */
+term_charset term_dec_special_graphics = {
+        /*
+         * Each row is anchored with a designator and then continues
+         * positionally for 8 entries. Slots 1..62 carry the plain ASCII values
+         * 33..94; slots 64..94 carry the drawing symbols. The first and last
+         * slot are undefined and hold -1 (stored in an unsigned 32-bit element).
+         */
+        [0]  =   -1,   33,   34,   35,   36,   37,   38,   39, /* [0] undefined */
+        [8]  =   40,   41,   42,   43,   44,   45,   46,   47,
+        [16] =   48,   49,   50,   51,   52,   53,   54,   55,
+        [24] =   56,   57,   58,   59,   60,   61,   62,   63,
+        [32] =   64,   65,   66,   67,   68,   69,   70,   71,
+        [40] =   72,   73,   74,   75,   76,   77,   78,   79,
+        [48] =   80,   81,   82,   83,   84,   85,   86,   87,
+        [56] =   88,   89,   90,   91,   92,   93,   94,    0,
+        [64] = 9830, 9618, 9225, 9228, 9229, 9226,  176,  177,
+        [72] = 9252, 9227, 9496, 9488, 9484, 9492, 9532, 9146,
+        [80] = 9147, 9472, 9148, 9149, 9500, 9508, 9524, 9516,
+        [88] = 9474, 8804, 8805,  960, 8800,  163, 8901,   -1, /* [95] undefined */
+};
--- a/src/libsystemd-terminal/term-internal.h
+++ b/src/libsystemd-terminal/term-internal.h
@ -37,6 +37,11 @@ typedef struct term_line term_line;
 typedef struct term_page term_page;
 typedef struct term_history term_history;

+typedef struct term_utf8 term_utf8;
+typedef struct term_seq term_seq;
+typedef struct term_parser term_parser;
+typedef uint32_t term_charset[96];
+
 /*
 * Miscellaneous
 * Sundry things and external helpers.
@ -335,3 +340,347 @@ void term_history_trim(term_history *history, unsigned int max);
 void term_history_push(term_history *history, term_line *line);
 term_line *term_history_pop(term_history *history, unsigned int reserve_width, const term_attr *attr, term_age_t age);
 unsigned int term_history_peek(term_history *history, unsigned int max, unsigned int reserve_width, const term_attr *attr, term_age_t age);
+
+/*
+ * UTF-8
+ * The UTF-decoder and encoder are adjusted for terminals and provide proper
+ * fallbacks for invalid UTF-8. In terminals it's quite usual to use fallbacks
+ * instead of rejecting invalid input. This way, old legacy applications still
+ * work (this is especially important for 7bit/ASCII DEC modes).
+ */
+
+/*
+ * Decoder state handed to term_utf8_decode() on every call; the caller keeps
+ * one object per byte stream and passes it back for each input byte.
+ */
+struct term_utf8 {
+        /* NOTE(review): presumably the buffer the pointer returned by
+         * term_utf8_decode() refers to -- confirm in the implementation */
+        uint32_t chars[5];
+        /* NOTE(review): presumably the code point being assembled -- confirm */
+        uint32_t ucs4;
+
+        /* 3-bit counters (range 0..7) and a 1-bit flag */
+        unsigned int i_bytes : 3;
+        unsigned int n_bytes : 3;
+        unsigned int valid : 1;
+};
+
+size_t term_utf8_encode(char *out_utf8, uint32_t g);
+const uint32_t *term_utf8_decode(term_utf8 *p, size_t *out_len, char c);
+
+/*
+ * Parsers
+ * The term_parser object parses control-sequences for both host and terminal
+ * side. Based on this parser, there is a set of command-parsers that take a
+ * term_seq sequence and returns the command it represents. This is different
+ * for host and terminal side so a different set of parsers is provided.
+ */
+
+/*
+ * Sequence classes. TERM_SEQ_NONE is 0 and TERM_SEQ_CNT is the number of
+ * values before it.
+ * NOTE(review): presumably these are the values stored in term_seq.type --
+ * confirm in the parser implementation.
+ */
+enum {
+        TERM_SEQ_NONE,                  /* placeholder, no sequence parsed */
+
+        TERM_SEQ_IGNORE,                /* no-op character */
+        TERM_SEQ_GRAPHIC,               /* graphic character */
+        TERM_SEQ_CONTROL,               /* control character */
+        TERM_SEQ_ESCAPE,                /* escape sequence */
+        TERM_SEQ_CSI,                   /* control sequence function */
+        TERM_SEQ_DCS,                   /* device control string */
+        TERM_SEQ_OSC,                   /* operating system control */
+
+        TERM_SEQ_CNT
+};
+
+/*
+ * One bit per character in the range 0x20..0x3f; the bit index is the
+ * character code minus 0x20.
+ * NOTE(review): (1U << 31) is not representable as an int, which ISO C
+ * requires of enumeration constants; this relies on the compiler accepting
+ * it as an extension -- confirm this is intended.
+ */
+enum {
+        /* these must be kept compatible to (1U << (ch - 0x20)) */
+
+        TERM_SEQ_FLAG_SPACE             = (1U <<  0),   /* char:   */
+        TERM_SEQ_FLAG_BANG              = (1U <<  1),   /* char: ! */
+        TERM_SEQ_FLAG_DQUOTE            = (1U <<  2),   /* char: " */
+        TERM_SEQ_FLAG_HASH              = (1U <<  3),   /* char: # */
+        TERM_SEQ_FLAG_CASH              = (1U <<  4),   /* char: $ */
+        TERM_SEQ_FLAG_PERCENT           = (1U <<  5),   /* char: % */
+        TERM_SEQ_FLAG_AND               = (1U <<  6),   /* char: & */
+        TERM_SEQ_FLAG_SQUOTE            = (1U <<  7),   /* char: ' */
+        TERM_SEQ_FLAG_POPEN             = (1U <<  8),   /* char: ( */
+        TERM_SEQ_FLAG_PCLOSE            = (1U <<  9),   /* char: ) */
+        TERM_SEQ_FLAG_MULT              = (1U << 10),   /* char: * */
+        TERM_SEQ_FLAG_PLUS              = (1U << 11),   /* char: + */
+        TERM_SEQ_FLAG_COMMA             = (1U << 12),   /* char: , */
+        TERM_SEQ_FLAG_MINUS             = (1U << 13),   /* char: - */
+        TERM_SEQ_FLAG_DOT               = (1U << 14),   /* char: . */
+        TERM_SEQ_FLAG_SLASH             = (1U << 15),   /* char: / */
+
+        /* 16-25 is reserved for numbers ('0' - 0x20 .. '9' - 0x20); unused */
+
+        /* COLON is reserved            = (1U << 26),      char: : */
+        /* SEMICOLON is reserved        = (1U << 27),      char: ; */
+        TERM_SEQ_FLAG_LT                = (1U << 28),   /* char: < */
+        TERM_SEQ_FLAG_EQUAL             = (1U << 29),   /* char: = */
+        TERM_SEQ_FLAG_GT                = (1U << 30),   /* char: > */
+        TERM_SEQ_FLAG_WHAT              = (1U << 31),   /* char: ? */
+};
+
+/*
+ * Command identifiers. TERM_CMD_NONE is 0 and TERM_CMD_CNT is the number of
+ * values before it. The spelled-out names follow the DEC VT-series, ECMA-48
+ * and xterm mnemonics; how sequences are mapped to these values is not
+ * visible in this header.
+ */
+enum {
+        TERM_CMD_NONE,                          /* placeholder */
+        TERM_CMD_GRAPHIC,                       /* graphics character */
+
+        TERM_CMD_BEL,                           /* bell */
+        TERM_CMD_BS,                            /* backspace */
+        TERM_CMD_CBT,                           /* cursor-backward-tabulation */
+        TERM_CMD_CHA,                           /* cursor-horizontal-absolute */
+        TERM_CMD_CHT,                           /* cursor-horizontal-forward-tabulation */
+        TERM_CMD_CNL,                           /* cursor-next-line */
+        TERM_CMD_CPL,                           /* cursor-previous-line */
+        TERM_CMD_CR,                            /* carriage-return */
+        TERM_CMD_CUB,                           /* cursor-backward */
+        TERM_CMD_CUD,                           /* cursor-down */
+        TERM_CMD_CUF,                           /* cursor-forward */
+        TERM_CMD_CUP,                           /* cursor-position */
+        TERM_CMD_CUU,                           /* cursor-up */
+        TERM_CMD_DA1,                           /* primary-device-attributes */
+        TERM_CMD_DA2,                           /* secondary-device-attributes */
+        TERM_CMD_DA3,                           /* tertiary-device-attributes */
+        TERM_CMD_DC1,                           /* device-control-1 */
+        TERM_CMD_DC3,                           /* device-control-3 */
+        TERM_CMD_DCH,                           /* delete-character */
+        TERM_CMD_DECALN,                        /* screen-alignment-pattern */
+        TERM_CMD_DECANM,                        /* ansi-mode */
+        TERM_CMD_DECBI,                         /* back-index */
+        TERM_CMD_DECCARA,                       /* change-attributes-in-rectangular-area */
+        TERM_CMD_DECCRA,                        /* copy-rectangular-area */
+        TERM_CMD_DECDC,                         /* delete-column */
+        TERM_CMD_DECDHL_BH,                     /* double-width-double-height-line: bottom half */
+        TERM_CMD_DECDHL_TH,                     /* double-width-double-height-line: top half */
+        TERM_CMD_DECDWL,                        /* double-width-single-height-line */
+        TERM_CMD_DECEFR,                        /* enable-filter-rectangle */
+        TERM_CMD_DECELF,                        /* enable-local-functions */
+        TERM_CMD_DECELR,                        /* enable-locator-reporting */
+        TERM_CMD_DECERA,                        /* erase-rectangular-area */
+        TERM_CMD_DECFI,                         /* forward-index */
+        TERM_CMD_DECFRA,                        /* fill-rectangular-area */
+        TERM_CMD_DECIC,                         /* insert-column */
+        TERM_CMD_DECID,                         /* identify-terminal */
+        TERM_CMD_DECINVM,                       /* invoke-macro */
+        TERM_CMD_DECKBD,                        /* keyboard-language-selection */
+        TERM_CMD_DECKPAM,                       /* keypad-application-mode */
+        TERM_CMD_DECKPNM,                       /* keypad-numeric-mode */
+        TERM_CMD_DECLFKC,                       /* local-function-key-control */
+        TERM_CMD_DECLL,                         /* load-leds */
+        TERM_CMD_DECLTOD,                       /* load-time-of-day */
+        TERM_CMD_DECPCTERM,                     /* pcterm-mode */
+        TERM_CMD_DECPKA,                        /* program-key-action */
+        TERM_CMD_DECPKFMR,                      /* program-key-free-memory-report */
+        TERM_CMD_DECRARA,                       /* reverse-attributes-in-rectangular-area */
+        TERM_CMD_DECRC,                         /* restore-cursor */
+        TERM_CMD_DECREQTPARM,                   /* request-terminal-parameters */
+        TERM_CMD_DECRPKT,                       /* report-key-type */
+        TERM_CMD_DECRQCRA,                      /* request-checksum-of-rectangular-area */
+        TERM_CMD_DECRQDE,                       /* request-display-extent */
+        TERM_CMD_DECRQKT,                       /* request-key-type */
+        TERM_CMD_DECRQLP,                       /* request-locator-position */
+        TERM_CMD_DECRQM_ANSI,                   /* request-mode: ansi */
+        TERM_CMD_DECRQM_DEC,                    /* request-mode: dec */
+        TERM_CMD_DECRQPKFM,                     /* request-program-key-free-memory */
+        TERM_CMD_DECRQPSR,                      /* request-presentation-state-report */
+        TERM_CMD_DECRQTSR,                      /* request-terminal-state-report */
+        TERM_CMD_DECRQUPSS,                     /* request-user-preferred-supplemental-set */
+        TERM_CMD_DECSACE,                       /* select-attribute-change-extent */
+        TERM_CMD_DECSASD,                       /* select-active-status-display */
+        TERM_CMD_DECSC,                         /* save-cursor */
+        TERM_CMD_DECSCA,                        /* select-character-protection-attribute */
+        TERM_CMD_DECSCL,                        /* select-conformance-level */
+        TERM_CMD_DECSCP,                        /* select-communication-port */
+        TERM_CMD_DECSCPP,                       /* set-columns-per-page */
+        TERM_CMD_DECSCS,                        /* select-communication-speed */
+        TERM_CMD_DECSCUSR,                      /* set-cursor-style */
+        TERM_CMD_DECSDDT,                       /* select-disconnect-delay-time */
+        TERM_CMD_DECSDPT,                       /* select-digital-printed-data-type */
+        TERM_CMD_DECSED,                        /* selective-erase-in-display */
+        TERM_CMD_DECSEL,                        /* selective-erase-in-line */
+        TERM_CMD_DECSERA,                       /* selective-erase-rectangular-area */
+        TERM_CMD_DECSFC,                        /* select-flow-control */
+        TERM_CMD_DECSKCV,                       /* set-key-click-volume */
+        TERM_CMD_DECSLCK,                       /* set-lock-key-style */
+        TERM_CMD_DECSLE,                        /* select-locator-events */
+        TERM_CMD_DECSLPP,                       /* set-lines-per-page */
+        TERM_CMD_DECSLRM_OR_SC,                 /* set-left-and-right-margins or save-cursor */
+        TERM_CMD_DECSMBV,                       /* set-margin-bell-volume */
+        TERM_CMD_DECSMKR,                       /* select-modifier-key-reporting */
+        TERM_CMD_DECSNLS,                       /* set-lines-per-screen */
+        TERM_CMD_DECSPP,                        /* set-port-parameter */
+        TERM_CMD_DECSPPCS,                      /* select-pro-printer-character-set */
+        TERM_CMD_DECSPRTT,                      /* select-printer-type */
+        TERM_CMD_DECSR,                         /* secure-reset */
+        TERM_CMD_DECSRFR,                       /* select-refresh-rate */
+        TERM_CMD_DECSSCLS,                      /* set-scroll-speed */
+        TERM_CMD_DECSSDT,                       /* select-status-display-line-type */
+        TERM_CMD_DECSSL,                        /* select-setup-language */
+        TERM_CMD_DECST8C,                       /* set-tab-at-every-8-columns */
+        TERM_CMD_DECSTBM,                       /* set-top-and-bottom-margins */
+        TERM_CMD_DECSTR,                        /* soft-terminal-reset */
+        TERM_CMD_DECSTRL,                       /* set-transmit-rate-limit */
+        TERM_CMD_DECSWBV,                       /* set-warning-bell-volume */
+        TERM_CMD_DECSWL,                        /* single-width-single-height-line */
+        TERM_CMD_DECTID,                        /* select-terminal-id */
+        TERM_CMD_DECTME,                        /* terminal-mode-emulation */
+        TERM_CMD_DECTST,                        /* invoke-confidence-test */
+        TERM_CMD_DL,                            /* delete-line */
+        TERM_CMD_DSR_ANSI,                      /* device-status-report: ansi */
+        TERM_CMD_DSR_DEC,                       /* device-status-report: dec */
+        TERM_CMD_ECH,                           /* erase-character */
+        TERM_CMD_ED,                            /* erase-in-display */
+        TERM_CMD_EL,                            /* erase-in-line */
+        TERM_CMD_ENQ,                           /* enquiry */
+        TERM_CMD_EPA,                           /* end-of-guarded-area */
+        TERM_CMD_FF,                            /* form-feed */
+        TERM_CMD_HPA,                           /* horizontal-position-absolute */
+        TERM_CMD_HPR,                           /* horizontal-position-relative */
+        TERM_CMD_HT,                            /* horizontal-tab */
+        TERM_CMD_HTS,                           /* horizontal-tab-set */
+        TERM_CMD_HVP,                           /* horizontal-and-vertical-position */
+        TERM_CMD_ICH,                           /* insert-character */
+        TERM_CMD_IL,                            /* insert-line */
+        TERM_CMD_IND,                           /* index */
+        TERM_CMD_LF,                            /* line-feed */
+        TERM_CMD_LS1R,                          /* locking-shift-1-right */
+        TERM_CMD_LS2,                           /* locking-shift-2 */
+        TERM_CMD_LS2R,                          /* locking-shift-2-right */
+        TERM_CMD_LS3,                           /* locking-shift-3 */
+        TERM_CMD_LS3R,                          /* locking-shift-3-right */
+        TERM_CMD_MC_ANSI,                       /* media-copy: ansi */
+        TERM_CMD_MC_DEC,                        /* media-copy: dec */
+        TERM_CMD_NEL,                           /* next-line */
+        TERM_CMD_NP,                            /* next-page */
+        TERM_CMD_NULL,                          /* null */
+        TERM_CMD_PP,                            /* preceding-page */
+        TERM_CMD_PPA,                           /* page-position-absolute */
+        TERM_CMD_PPB,                           /* page-position-backward */
+        TERM_CMD_PPR,                           /* page-position-relative */
+        TERM_CMD_RC,                            /* restore-cursor */
+        TERM_CMD_REP,                           /* repeat */
+        TERM_CMD_RI,                            /* reverse-index */
+        TERM_CMD_RIS,                           /* reset-to-initial-state */
+        TERM_CMD_RM_ANSI,                       /* reset-mode: ansi */
+        TERM_CMD_RM_DEC,                        /* reset-mode: dec */
+        TERM_CMD_S7C1T,                         /* set-7bit-c1-terminal */
+        TERM_CMD_S8C1T,                         /* set-8bit-c1-terminal */
+        TERM_CMD_SCS,                           /* select-character-set */
+        TERM_CMD_SD,                            /* scroll-down */
+        TERM_CMD_SGR,                           /* select-graphic-rendition */
+        TERM_CMD_SI,                            /* shift-in */
+        TERM_CMD_SM_ANSI,                       /* set-mode: ansi */
+        TERM_CMD_SM_DEC,                        /* set-mode: dec */
+        TERM_CMD_SO,                            /* shift-out */
+        TERM_CMD_SPA,                           /* start-of-guarded-area */
+        TERM_CMD_SS2,                           /* single-shift-2 */
+        TERM_CMD_SS3,                           /* single-shift-3 */
+        TERM_CMD_ST,                            /* string-terminator */
+        TERM_CMD_SU,                            /* scroll-up */
+        TERM_CMD_SUB,                           /* substitute */
+        TERM_CMD_TBC,                           /* tab-clear */
+        TERM_CMD_VPA,                           /* vertical-line-position-absolute */
+        TERM_CMD_VPR,                           /* vertical-line-position-relative */
+        TERM_CMD_VT,                            /* vertical-tab */
+        TERM_CMD_XTERM_CLLHP,                   /* xterm-cursor-lower-left-hp-bugfix */
+        TERM_CMD_XTERM_IHMT,                    /* xterm-initiate-highlight-mouse-tracking */
+        TERM_CMD_XTERM_MLHP,                    /* xterm-memory-lock-hp-bugfix */
+        TERM_CMD_XTERM_MUHP,                    /* xterm-memory-unlock-hp-bugfix */
+        TERM_CMD_XTERM_RPM,                     /* xterm-restore-private-mode */
+        TERM_CMD_XTERM_RRV,                     /* xterm-reset-resource-value */
+        TERM_CMD_XTERM_RTM,                     /* xterm-reset-title-mode */
+        TERM_CMD_XTERM_SACL1,                   /* xterm-set-ansi-conformance-level-1 */
+        TERM_CMD_XTERM_SACL2,                   /* xterm-set-ansi-conformance-level-2 */
+        TERM_CMD_XTERM_SACL3,                   /* xterm-set-ansi-conformance-level-3 */
+        TERM_CMD_XTERM_SDCS,                    /* xterm-set-default-character-set */
+        TERM_CMD_XTERM_SGFX,                    /* xterm-sixel-graphics */
+        TERM_CMD_XTERM_SPM,                     /* xterm-set-private-mode */
+        TERM_CMD_XTERM_SRV,                     /* xterm-set-resource-value */
+        TERM_CMD_XTERM_STM,                     /* xterm-set-title-mode */
+        TERM_CMD_XTERM_SUCS,                    /* xterm-set-utf8-character-set */
+        TERM_CMD_XTERM_WM,                      /* xterm-window-management */
+
+        TERM_CMD_CNT
+};
+
+/*
+ * Character-set identifiers, laid out as three consecutive ranges:
+ * [1, TERM_CHARSET_96_CNT) for 96-compat sets,
+ * [TERM_CHARSET_96_CNT, TERM_CHARSET_94_CNT) for 94-compat sets, followed by
+ * the special sets up to TERM_CHARSET_CNT.
+ */
+enum {
+        /*
+         * Charsets: DEC marks charsets according to "Digital Equ. Corp.".
+         *           NRCS marks charsets according to the "National Replacement
+         *           Character Sets". ISO marks charsets according to ISO-8859.
+         * The USERDEF charset is special and can be modified by the host.
+         */
+
+        TERM_CHARSET_NONE,
+
+        /* 96-compat charsets */
+        TERM_CHARSET_ISO_LATIN1_SUPPLEMENTAL,
+        /* alias: same value as the entry directly above */
+        TERM_CHARSET_BRITISH_NRCS = TERM_CHARSET_ISO_LATIN1_SUPPLEMENTAL,
+        TERM_CHARSET_ISO_LATIN2_SUPPLEMENTAL,
+        /* alias: same value as the entry directly above */
+        TERM_CHARSET_AMERICAN_NRCS = TERM_CHARSET_ISO_LATIN2_SUPPLEMENTAL,
+        TERM_CHARSET_ISO_LATIN5_SUPPLEMENTAL,
+        TERM_CHARSET_ISO_GREEK_SUPPLEMENTAL,
+        TERM_CHARSET_ISO_HEBREW_SUPPLEMENTAL,
+        TERM_CHARSET_ISO_LATIN_CYRILLIC,
+
+        /* one past the last 96-compat identifier */
+        TERM_CHARSET_96_CNT,
+
+        /* 94-compat charsets */
+        /* the first entry reuses TERM_CHARSET_96_CNT so no value is skipped */
+        TERM_CHARSET_DEC_SPECIAL_GRAPHIC = TERM_CHARSET_96_CNT,
+        TERM_CHARSET_DEC_SUPPLEMENTAL,
+        TERM_CHARSET_DEC_TECHNICAL,
+        TERM_CHARSET_CYRILLIC_DEC,
+        TERM_CHARSET_DUTCH_NRCS,
+        TERM_CHARSET_FINNISH_NRCS,
+        TERM_CHARSET_FRENCH_NRCS,
+        TERM_CHARSET_FRENCH_CANADIAN_NRCS,
+        TERM_CHARSET_GERMAN_NRCS,
+        TERM_CHARSET_GREEK_DEC,
+        TERM_CHARSET_GREEK_NRCS,
+        TERM_CHARSET_HEBREW_DEC,
+        TERM_CHARSET_HEBREW_NRCS,
+        TERM_CHARSET_ITALIAN_NRCS,
+        TERM_CHARSET_NORWEGIAN_DANISH_NRCS,
+        TERM_CHARSET_PORTUGUESE_NRCS,
+        TERM_CHARSET_RUSSIAN_NRCS,
+        TERM_CHARSET_SCS_NRCS,
+        TERM_CHARSET_SPANISH_NRCS,
+        TERM_CHARSET_SWEDISH_NRCS,
+        TERM_CHARSET_SWISS_NRCS,
+        TERM_CHARSET_TURKISH_DEC,
+        TERM_CHARSET_TURKISH_NRCS,
+
+        /* one past the last 94-compat identifier */
+        TERM_CHARSET_94_CNT,
+
+        /* special charsets */
+        /* the first entry reuses TERM_CHARSET_94_CNT so no value is skipped */
+        TERM_CHARSET_USERPREF_SUPPLEMENTAL = TERM_CHARSET_94_CNT,
+
+        /* total number of identifiers, including TERM_CHARSET_NONE */
+        TERM_CHARSET_CNT,
+};
+
+extern term_charset term_unicode_lower;
+extern term_charset term_unicode_upper;
+extern term_charset term_dec_supplemental_graphics;
+extern term_charset term_dec_special_graphics;
+
+#define TERM_PARSER_ARG_MAX (16)
+#define TERM_PARSER_ST_MAX (4096)
+
+/*
+ * One parsed sequence, as handed out through the seq_out parameter of
+ * term_parser_feed().
+ * NOTE(review): the field-to-enum mapping noted below is inferred from the
+ * names only -- confirm in the parser implementation.
+ */
+struct term_seq {
+        unsigned int type;              /* presumably a TERM_SEQ_* value */
+        unsigned int command;           /* presumably a TERM_CMD_* value */
+        uint32_t terminator;
+        unsigned int intermediates;     /* presumably a TERM_SEQ_FLAG_* bitmask */
+        unsigned int charset;           /* presumably a TERM_CHARSET_* value */
+        unsigned int n_args;
+        int args[TERM_PARSER_ARG_MAX];  /* fixed capacity of TERM_PARSER_ARG_MAX entries */
+        unsigned int n_st;
+        char *st;
+};
+
+/*
+ * Parser object created by term_parser_new() and released by
+ * term_parser_free(). It embeds the term_seq that is being assembled.
+ */
+struct term_parser {
+        term_seq seq;
+        /* NOTE(review): presumably the allocated size of seq.st -- confirm */
+        size_t st_alloc;
+        unsigned int state;
+
+        /* NOTE(review): presumably set from the "host" argument of
+         * term_parser_new() -- confirm */
+        bool is_host : 1;
+};
+
+int term_parser_new(term_parser **out, bool host);
+term_parser *term_parser_free(term_parser *parser);
+int term_parser_feed(term_parser *parser, const term_seq **seq_out, uint32_t raw);
+
+#define _term_parser_free_ _cleanup_(term_parser_freep)
+DEFINE_TRIVIAL_CLEANUP_FUNC(term_parser*, term_parser_free);
--- a/src/libsystemd-terminal/term-parser.c
+++ b/src/libsystemd-terminal/term-parser.c
--- a/src/libsystemd-terminal/test-term-parser.c
+++ b/src/libsystemd-terminal/test-term-parser.c
@ -0,0 +1,143 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+/***
+  This file is part of systemd.
+
+  Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+/*
+ * Terminal Parser Tests
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "macro.h"
+#include "term-internal.h"
+#include "util.h"
+
+/*
+ * Exercise term_utf8_decode() with missing arguments and with a truncated
+ * multi-byte sequence. The decoder state in @st is carried across all calls.
+ */
+static void test_term_utf8_invalid(void) {
+        term_utf8 st = { };
+        const uint32_t *out;
+        size_t n;
+
+        /* no state and no length pointer: nothing is returned */
+        out = term_utf8_decode(NULL, NULL, 0);
+        assert_se(!out);
+
+        /* the length pointer is optional when a state is given */
+        out = term_utf8_decode(&st, NULL, 0);
+        assert_se(out);
+
+        /* no state: the length is reset to 0 */
+        n = 5;
+        out = term_utf8_decode(NULL, &n, 0);
+        assert_se(!out);
+        assert_se(n == 0);
+
+        /* a plain NUL byte yields exactly one character */
+        n = 5;
+        out = term_utf8_decode(&st, &n, 0);
+        assert_se(out);
+        assert_se(n == 1);
+
+        /* 0xCF opens a 2-byte sequence, so nothing is emitted yet */
+        n = 5;
+        out = term_utf8_decode(&st, &n, 0xCf);
+        assert_se(!out);
+        assert_se(n == 0);
+
+        /* NUL is no continuation byte: both pending bytes are emitted */
+        n = 5;
+        out = term_utf8_decode(&st, &n, 0x0);
+        assert_se(out);
+        assert_se(n == 2);
+}
+
+/*
+ * Round-trip every code point from 0 up to and including U+10FFFF through
+ * term_utf8_encode() and term_utf8_decode(). Values the encoder refuses
+ * (length 0) are skipped. The decoder must stay silent until the last byte
+ * of a sequence and then hand back exactly the original code point.
+ */
+static void test_term_utf8_range(void) {
+        term_utf8 p = { };
+        const uint32_t *res;
+        char u8[4];
+        uint32_t i, j;
+        size_t ulen, len;
+
+        /* Convert all ucs-4 chars to utf-8 and back */
+
+        /* inclusive bound: U+10FFFF is the last code point and must be covered */
+        for (i = 0; i <= 0x10FFFF; ++i) {
+                ulen = term_utf8_encode(u8, i);
+                if (!ulen)
+                        continue;
+
+                for (j = 0; j < ulen; ++j) {
+                        res = term_utf8_decode(&p, &len, u8[j]);
+                        if (!res) {
+                                /* only non-final bytes may produce no output */
+                                assert_se(j + 1 != ulen);
+                                continue;
+                        }
+
+                        /* output is only expected on the final byte */
+                        assert_se(j + 1 == ulen);
+                        assert_se(len == 1 && *res == i);
+                        /* anything above ASCII needs a multi-byte encoding */
+                        assert_se(i <= 127 || ulen >= 2);
+                }
+        }
+}
+
+/*
+ * Feed a byte stream that mixes plain, overlong and broken sequences into
+ * term_utf8_decode() and compare everything it emits, in order, against the
+ * expected list of code points.
+ */
+static void test_term_utf8_mix(void) {
+        static const char source[] = {
+                0x00,                           /* normal 0 */
+                0xC0, 0x80,                     /* overlong 0 */
+                0xC0, 0x81,                     /* overlong 1 */
+                0xE0, 0x80, 0x81,               /* overlong 1 */
+                0xF0, 0x80, 0x80, 0x81,         /* overlong 1 */
+                0xC0, 0x00,                     /* invalid continuation */
+                0xC0, 0xC0, 0x81,               /* invalid continuation with a following overlong 1 */
+                0xF8, 0x80, 0x80, 0x80, 0x81,   /* overlong 1 with 5 bytes */
+                0xE0, 0x80, 0xC0, 0x81,         /* invalid 3-byte followed by valid 2-byte */
+                0xF0, 0x80, 0x80, 0xC0, 0x81,   /* invalid 4-byte followed by valid 2-byte */
+        };
+        static const uint32_t result[] = {
+                0x0000,
+                0x0000,
+                0x0001,
+                0x0001,
+                0x0001,
+                0x00C0, 0x0000,
+                0x00C0, 0x0001,
+                0x00F8, 0x0080, 0x0080, 0x0080, 0x0081,
+                0x00E0, 0x0080, 0x0001,
+                0x00F0, 0x0080, 0x0080, 0x0001,
+        };
+        term_utf8 st = { };
+        size_t in, seen = 0, n;
+
+        for (in = 0; in < sizeof(source); ++in) {
+                const uint32_t *chunk = term_utf8_decode(&st, &n, source[in]);
+
+                /* the decoder emits nothing while a sequence is still open */
+                if (chunk) {
+                        assert_se(seen + n <= ELEMENTSOF(result));
+                        assert_se(memcmp(chunk, result + seen, n * sizeof(*chunk)) == 0);
+                        seen += n;
+                }
+        }
+
+        /* every expected code point must have been produced */
+        assert_se(seen == ELEMENTSOF(result));
+}
+
+/* Run all UTF-8 test cases in their fixed order; a failing assert_se() aborts. */
+int main(int argc, char *argv[]) {
+        static void (* const cases[])(void) = {
+                test_term_utf8_invalid,
+                test_term_utf8_range,
+                test_term_utf8_mix,
+        };
+        size_t k;
+
+        for (k = 0; k < ELEMENTSOF(cases); ++k)
+                cases[k]();
+
+        return 0;
+}