strcoll: Remove incorrect STRDIFF-based optimization (Bug 18589).

The optimization introduced in commit
f13c2a8dff, causes regressions in
sorting for languages that have digraphs that change sort order, like
cs_CZ which sorts ch between h and i.

My analysis shows the fast-forwarding optimization in STRCOLL advances
through a digraph while possibly stopping in the middle which results
in a subsequent skipping of the digraph and incorrect sorting. The
optimization is incorrect as implemented and because of that I'm
removing it for 2.23, and I will also commit this fix for 2.22 where
it was originally introduced.

This patch reverts the optimization, introduces a new bug-strcoll2.c
regression test that tests both cs_CZ.UTF-8 and da_DK.ISO-8859-1 and
ensures they sort one digraph each correctly. The optimization can't be
applied without regressing this test.

Checked on x86_64, bug-strcoll2.c fails without this patch and passes
after. This will also get a fix on 2.22 which has the same bug.
This commit is contained in:
Carlos O'Donell 2015-10-08 16:34:53 -04:00
parent fd91891a50
commit 87701a58e2
10 changed files with 116 additions and 67 deletions

View file

@ -1,3 +1,17 @@
2015-10-08 Carlos O'Donell <carlos@redhat.com>
[BZ #18589]
* string/bug-strcoll2.c: New file.
* locale/categories.def: Revert commit
f13c2a8dff2329c6692a80176262ceaaf8a6f74e.
* locale/langinfo.h: Likewise.
* locale/localeinfo.h: Likewise.
* locale/C-collate.c: Likewise.
* programs/ld-collate.c (collate_output): Likewise.
* string/strcoll_l.c (STRDIFF): Likewise.
(STRCOLL): Likewise.
* wcsmbs/wcscoll_l.c: Likewise.
2015-10-08 Joseph Myers <joseph@codesourcery.com>
* math/libm-test.inc (lround_test_data): Do not expect the absence

14
NEWS
View file

@ -13,13 +13,13 @@ Version 2.23
15384, 15786, 15918, 16141, 16296, 16347, 16415, 16517, 16519, 16520,
16521, 16620, 16734, 16973, 16985, 17118, 17243, 17244, 17250, 17441,
17787, 17886, 17887, 17905, 18084, 18086, 18240, 18265, 18370, 18421,
18480, 18525, 18595, 18610, 18618, 18647, 18661, 18674, 18675, 18681,
18724, 18757, 18778, 18781, 18787, 18789, 18790, 18795, 18796, 18803,
18820, 18823, 18824, 18825, 18857, 18863, 18870, 18872, 18873, 18875,
18887, 18921, 18951, 18952, 18956, 18961, 18966, 18967, 18969, 18970,
18977, 18980, 18981, 18985, 19003, 19012, 19016, 19018, 19032, 19046,
19049, 19050, 19059, 19071, 19076, 19077, 19078, 19079, 19085, 19086,
19088.
18480, 18525, 18595, 18589, 18610, 18618, 18647, 18661, 18674, 18675,
18681, 18724, 18757, 18778, 18781, 18787, 18789, 18790, 18795, 18796,
18803, 18820, 18823, 18824, 18825, 18857, 18863, 18870, 18872, 18873,
18875, 18887, 18921, 18951, 18952, 18956, 18961, 18966, 18967, 18969,
18970, 18977, 18980, 18981, 18985, 19003, 19012, 19016, 19018, 19032,
19046, 19049, 19050, 19059, 19071, 19076, 19077, 19078, 19079, 19085,
19086, 19088.
* The obsolete header <regexp.h> has been removed. Programs that require
this header must be updated to use <regex.h> instead.

View file

@ -144,8 +144,6 @@ const struct __locale_data _nl_C_LC_COLLATE attribute_hidden =
/* _NL_COLLATE_COLLSEQWC */
{ .string = (const char *) collseqwc },
/* _NL_COLLATE_CODESET */
{ .string = _nl_C_codeset },
/* _NL_COLLATE_ENCODING_TYPE */
{ .word = __cet_8bit }
{ .string = _nl_C_codeset }
}
};

View file

@ -58,7 +58,6 @@ DEFINE_CATEGORY
DEFINE_ELEMENT (_NL_COLLATE_COLLSEQMB, "collate-collseqmb", std, wstring)
DEFINE_ELEMENT (_NL_COLLATE_COLLSEQWC, "collate-collseqwc", std, wstring)
DEFINE_ELEMENT (_NL_COLLATE_CODESET, "collate-codeset", std, string)
DEFINE_ELEMENT (_NL_COLLATE_ENCODING_TYPE, "collate-encoding-type", std, word)
), NO_POSTLOAD)

View file

@ -255,7 +255,6 @@ enum
_NL_COLLATE_COLLSEQMB,
_NL_COLLATE_COLLSEQWC,
_NL_COLLATE_CODESET,
_NL_COLLATE_ENCODING_TYPE,
_NL_NUM_LC_COLLATE,
/* LC_CTYPE category: character classification.

View file

@ -110,14 +110,6 @@ enum coll_sort_rule
sort_mask
};
/* Collation encoding type. */
enum collation_encoding_type
{
__cet_other,
__cet_8bit,
__cet_utf8
};
/* We can map the types of the entries into a few categories. */
enum value_type
{

View file

@ -32,7 +32,6 @@
#include "linereader.h"
#include "locfile.h"
#include "elem-hash.h"
#include "../localeinfo.h"
/* Uncomment the following line in the production version. */
/* #define NDEBUG 1 */
@ -2131,8 +2130,6 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
/* The words have to be handled specially. */
if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
add_locale_uint32 (&file, 0);
else if (idx == _NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE))
add_locale_uint32 (&file, __cet_other);
else
add_locale_empty (&file);
}
@ -2496,12 +2493,6 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
add_locale_raw_data (&file, collate->mbseqorder, 256);
add_locale_collseq_table (&file, &collate->wcseqorder);
add_locale_string (&file, charmap->code_set_name);
if (strcmp (charmap->code_set_name, "UTF-8") == 0)
add_locale_uint32 (&file, __cet_utf8);
else if (charmap->mb_cur_max == 1)
add_locale_uint32 (&file, __cet_8bit);
else
add_locale_uint32 (&file, __cet_other);
write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
obstack_free (&weightpool, NULL);

93
string/bug-strcoll2.c Normal file
View file

@ -0,0 +1,93 @@
/* Bug 18589: sort-test.sh fails at random.
Copyright (C) 1998-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <stdio.h>
#include <string.h>
#include <locale.h>
/* An incorrect strcoll optimization resulted in incorrect
results from strcoll for cs_CZ and da_DK. */
int
test_cs_CZ (void)
{
const char t1[] = "config";
const char t2[] = "choose";
if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
{
perror ("setlocale");
return 1;
}
/* In Czech the digraph ch sorts after c, therefore we expect
config to sort before choose. */
int a = strcoll (t1, t2);
int b = strcoll (t2, t1);
printf ("strcoll (\"%s\", \"%s\") = %d\n", t1, t2, a);
printf ("strcoll (\"%s\", \"%s\") = %d\n", t2, t1, b);
if (a < 0 && b > 0)
{
puts ("PASS: config < choose");
return 0;
}
else
{
puts ("FAIL: Wrong sorting in cz_CZ.UTF-8.");
return 1;
}
}
int
test_da_DK (void)
{
const char t1[] = "AS";
const char t2[] = "AA";
if (setlocale (LC_ALL, "da_DK.ISO-8859-1") == NULL)
{
perror ("setlocale");
return 1;
}
/* AA should be treated as the last letter of the Danish alphabet,
hence sorting after AS. */
int a = strcoll (t1, t2);
int b = strcoll (t2, t1);
printf ("strcoll (\"%s\", \"%s\") = %d\n", t1, t2, a);
printf ("strcoll (\"%s\", \"%s\") = %d\n", t2, t1, b);
if (a < 0 && b > 0)
{
puts ("PASS: AS < AA");
return 0;
}
else
{
puts ("FAIL: Wrong sorting in da_DK.ISO-8859-1");
return 1;
}
}
static int
do_test (void)
{
int err = 0;
err |= test_cs_CZ ();
err |= test_da_DK ();
return err;
}
#define TEST_FUNCTION do_test ()
#include "../test-skeleton.c"

View file

@ -29,7 +29,6 @@
# define STRING_TYPE char
# define USTRING_TYPE unsigned char
# define STRCOLL __strcoll_l
# define STRDIFF __strdiff
# define STRCMP strcmp
# define WEIGHT_H "../locale/weight.h"
# define SUFFIX MB
@ -42,20 +41,6 @@
#include "../locale/localeinfo.h"
#include WEIGHT_H
#define MASK_UTF8_7BIT (1 << 7)
#define MASK_UTF8_START (3 << 6)
size_t
STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
{
size_t n;
for (n = 0; *s != '\0' && *s++ == *t++; ++n)
continue;
return n;
}
/* Track status while looking for sequences in a string. */
typedef struct
{
@ -269,29 +254,9 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
const USTRING_TYPE *extra;
const int32_t *indirect;
/* In case there is no locale specific sort order (C / POSIX). */
if (nrules == 0)
return STRCMP (s1, s2);
/* Fast forward to the position of the first difference. Needs to be
encoding aware as the byte-by-byte comparison can stop in the middle
of a char sequence for multibyte encodings like UTF-8. */
uint_fast32_t encoding =
current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
if (encoding != __cet_other)
{
size_t diff = STRDIFF (s1, s2);
if (diff > 0)
{
if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
do
diff--;
while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
s1 += diff;
s2 += diff;
}
}
/* Catch empty strings. */
if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
return (*s1 != '\0') - (*s2 != '\0');
@ -358,8 +323,7 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
byte-level comparison to ensure that we don't waste time
going through multiple passes for totally equal strings
before proceeding to subsequent passes. */
if (pass == 0 && encoding == __cet_other &&
STRCMP (s1, s2) == 0)
if (pass == 0 && STRCMP (s1, s2) == 0)
return result;
else
break;

View file

@ -23,7 +23,6 @@
#define STRING_TYPE wchar_t
#define USTRING_TYPE wint_t
#define STRCOLL __wcscoll_l
#define STRDIFF __wcsdiff
#define STRCMP __wcscmp
#define WEIGHT_H "../locale/weightwc.h"
#define SUFFIX WC