locale: Introduce translate_unicode_codepoint into linereader.c

This will permit reusing the Unicode character processing for
different character encodings, not just the current <U...> encoding.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>
This commit is contained in:
Florian Weimer 2022-07-05 09:05:22 +02:00
parent 19d4944459
commit 7dcaabb94c
1 changed files with 85 additions and 82 deletions

View File

@ -596,6 +596,83 @@ get_ident (struct linereader *lr)
return &lr->token;
}
/* Process a decoded Unicode codepoint WCH in a string, placing the
multibyte sequence into LRB. Return false if the character is not
found in CHARMAP/REPERTOIRE. */
static bool
translate_unicode_codepoint (struct localedef_t *locale,
const struct charmap_t *charmap,
const struct repertoire_t *repertoire,
uint32_t wch, struct lr_buffer *lrb)
{
/* See whether the charmap contains the Uxxxxxxxx names. */
char utmp[10];
snprintf (utmp, sizeof (utmp), "U%08X", wch);
struct charseq *seq = charmap_find_value (charmap, utmp, 9);
if (seq == NULL)
{
/* No, this isn't the case. Now determine from
the repertoire the name of the character and
find it in the charmap. */
if (repertoire != NULL)
{
const char *symbol = repertoire_find_symbol (repertoire, wch);
if (symbol != NULL)
seq = charmap_find_value (charmap, symbol, strlen (symbol));
}
if (seq == NULL)
{
#ifndef NO_TRANSLITERATION
/* Transliterate if possible. */
if (locale != NULL)
{
if ((locale->avail & CTYPE_LOCALE) == 0)
{
/* Load the CTYPE data now. */
int old_needed = locale->needed;
locale->needed = 0;
locale = load_locale (LC_CTYPE, locale->name,
locale->repertoire_name,
charmap, locale);
locale->needed = old_needed;
}
uint32_t *translit;
if ((locale->avail & CTYPE_LOCALE) != 0
&& ((translit = find_translit (locale, charmap, wch))
!= NULL))
/* The CTYPE data contains a matching
transliteration. */
{
for (int i = 0; translit[i] != 0; ++i)
{
snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
seq = charmap_find_value (charmap, utmp, 9);
assert (seq != NULL);
adds (lrb, seq->bytes, seq->nbytes);
}
return true;
}
}
#endif /* NO_TRANSLITERATION */
/* Not a known name. */
return false;
}
}
if (seq != NULL)
{
adds (lrb, seq->bytes, seq->nbytes);
return true;
}
else
return false;
}
static struct token *
get_string (struct linereader *lr, const struct charmap_t *charmap,
@ -635,7 +712,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
}
else
{
int illegal_string = 0;
bool illegal_string = false;
size_t buf2act = 0;
size_t buf2max = 56 * sizeof (uint32_t);
int ch;
@ -695,7 +772,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
{
/* <> is no correct name. Ignore it and also signal an
error. */
illegal_string = 1;
illegal_string = true;
continue;
}
@ -709,8 +786,6 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
if (cp == &lrb.buf[lrb.act])
{
char utmp[10];
/* Yes, it is. */
addc (&lrb, '\0');
wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
@ -721,81 +796,9 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
if (return_widestr)
ADDWC (wch);
/* See whether the charmap contains the Uxxxxxxxx names. */
snprintf (utmp, sizeof (utmp), "U%08X", wch);
seq = charmap_find_value (charmap, utmp, 9);
if (seq == NULL)
{
/* No, this isn't the case. Now determine from
the repertoire the name of the character and
find it in the charmap. */
if (repertoire != NULL)
{
const char *symbol;
symbol = repertoire_find_symbol (repertoire, wch);
if (symbol != NULL)
seq = charmap_find_value (charmap, symbol,
strlen (symbol));
}
if (seq == NULL)
{
#ifndef NO_TRANSLITERATION
/* Transliterate if possible. */
if (locale != NULL)
{
uint32_t *translit;
if ((locale->avail & CTYPE_LOCALE) == 0)
{
/* Load the CTYPE data now. */
int old_needed = locale->needed;
locale->needed = 0;
locale = load_locale (LC_CTYPE,
locale->name,
locale->repertoire_name,
charmap, locale);
locale->needed = old_needed;
}
if ((locale->avail & CTYPE_LOCALE) != 0
&& ((translit = find_translit (locale,
charmap, wch))
!= NULL))
/* The CTYPE data contains a matching
transliteration. */
{
int i;
for (i = 0; translit[i] != 0; ++i)
{
char utmp[10];
snprintf (utmp, sizeof (utmp), "U%08X",
translit[i]);
seq = charmap_find_value (charmap, utmp,
9);
assert (seq != NULL);
adds (&lrb, seq->bytes, seq->nbytes);
}
continue;
}
}
#endif /* NO_TRANSLITERATION */
/* Not a known name. */
illegal_string = 1;
}
}
if (seq != NULL)
adds (&lrb, seq->bytes, seq->nbytes);
if (!translate_unicode_codepoint (locale, charmap,
repertoire, wch, &lrb))
illegal_string = true;
continue;
}
}
@ -812,7 +815,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
/* This name is not in the charmap. */
lr_error (lr, _("symbol `%.*s' not in charmap"),
(int) (lrb.act - startidx), &lrb.buf[startidx]);
illegal_string = 1;
illegal_string = true;
}
if (return_widestr)
@ -833,7 +836,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
/* This name is not in the repertoire map. */
lr_error (lr, _("symbol `%.*s' not in repertoire map"),
(int) (lrb.act - startidx), &lrb.buf[startidx]);
illegal_string = 1;
illegal_string = true;
}
else
ADDWC (wch);
@ -850,7 +853,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
if (ch == '\n' || ch == EOF)
{
lr_error (lr, _("unterminated string"));
illegal_string = 1;
illegal_string = true;
}
if (illegal_string)