LibUnicode: Work around ICU bug over-canonicalizing "yes" keyword values

ICU will canonicalize "yes" to "true" in all Unicode keywords. This is a
bug - only some keywords should undergo this change. It will then change
"true" to the empty string. This is a known ICU bug, which we can avoid
for now by detecting which keywords should retain their "yes" value.
This commit is contained in:
Timothy Flynn
2026-03-13 14:38:56 -04:00
committed by Tim Flynn
parent 776134ce03
commit f22710abba
Notes: github-actions[bot] 2026-03-14 12:17:56 +00:00
4 changed files with 75 additions and 24 deletions

View File

@@ -8,6 +8,7 @@
#include <AK/NonnullOwnPtr.h>
#include <AK/Utf16View.h>
#include <LibUnicode/ICU.h>
#include <LibUnicode/Locale.h>
#include <unicode/dtptngen.h>
#include <unicode/locdspnm.h>
@@ -45,18 +46,65 @@ LocaleData::LocaleData(icu::Locale locale)
{
}
String LocaleData::to_string()
String LocaleData::canonicalize(StringView locale)
{
if (!m_locale_string.has_value()) {
UErrorCode status = U_ZERO_ERROR;
auto locale_data = LocaleData::for_locale(locale);
VERIFY(locale_data.has_value());
auto result = locale().toLanguageTag<StringBuilder>(status);
verify_icu_success(status);
if (locale_data->m_canonical_locale_string.has_value())
return *locale_data->m_canonical_locale_string;
m_locale_string = MUST(result.to_string());
UErrorCode status = U_ZERO_ERROR;
// FIXME: ICU's canonicalize() and toLanguageTag() incorrectly convert the Unicode extension value "yes" to "true"
// for all keywords (and then remove "true" per UTS 35). Per CLDR BCP47 data, only specific keys define "yes"
// as an alias for "true" (kb, kc, kh, kk, kn). For other keys, "yes" must be preserved. See:
// https://unicode-org.atlassian.net/browse/ICU-21367
HashTable<ByteString> keywords_with_yes;
if (auto parsed = parse_unicode_locale_id(locale); parsed.has_value()) {
parsed->for_each_extension_of_type<LocaleExtension>([&](auto const& extension) {
for (auto const& keyword : extension.keywords) {
if (!keyword.value.equals_ignoring_ascii_case("yes"sv))
continue;
auto key = keyword.key.to_ascii_lowercase().to_byte_string();
if (auto const* legacy_key = uloc_toLegacyKey(key.characters())) {
if (auto const* value = uloc_toUnicodeLocaleType(legacy_key, "yes"); !value || value != "true"sv)
keywords_with_yes.set(move(key));
}
}
return IterationDecision::Continue;
});
}
return *m_locale_string;
locale_data->locale().canonicalize(status);
verify_icu_success(status);
auto result = locale_data->locale().toLanguageTag<StringBuilder>(status);
verify_icu_success(status);
if (keywords_with_yes.is_empty()) {
locale_data->m_canonical_locale_string = MUST(result.to_string());
} else {
auto parsed = parse_unicode_locale_id(result.string_view());
VERIFY(parsed.has_value());
parsed->for_each_extension_of_type<LocaleExtension>([&](auto& extension) {
for (auto& keyword : extension.keywords) {
if (keyword.value.is_empty() && keywords_with_yes.contains(keyword.key.bytes_as_string_view()))
keyword.value = "yes"_string;
}
return IterationDecision::Continue;
});
locale_data->m_canonical_locale_string = parsed->to_string();
}
return *locale_data->m_canonical_locale_string;
}
icu::LocaleDisplayNames& LocaleData::standard_display_names()

View File

@@ -37,11 +37,10 @@ namespace Unicode {
class LocaleData {
public:
static Optional<LocaleData&> for_locale(StringView locale);
static String canonicalize(StringView locale);
ALWAYS_INLINE icu::Locale& locale() { return m_locale; }
String to_string();
icu::LocaleDisplayNames& standard_display_names();
icu::LocaleDisplayNames& dialect_display_names();
@@ -58,7 +57,7 @@ private:
explicit LocaleData(icu::Locale locale);
icu::Locale m_locale;
Optional<String> m_locale_string;
Optional<String> m_canonical_locale_string;
OwnPtr<icu::LocaleDisplayNames> m_standard_display_names;
OwnPtr<icu::LocaleDisplayNames> m_dialect_display_names;

View File

@@ -478,15 +478,7 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
String canonicalize_unicode_locale_id(StringView locale)
{
UErrorCode status = U_ZERO_ERROR;
auto locale_data = LocaleData::for_locale(locale);
VERIFY(locale_data.has_value());
locale_data->locale().canonicalize(status);
verify_icu_success(status);
return locale_data->to_string();
return LocaleData::canonicalize(locale);
}
String canonicalize_unicode_extension_values(StringView key, StringView value)
@@ -499,9 +491,6 @@ String canonicalize_unicode_extension_values(StringView key, StringView value)
auto locale = builder.build(status);
verify_icu_success(status);
locale.canonicalize(status);
verify_icu_success(status);
auto result = locale.getUnicodeKeywordValue<StringBuilder>(icu_string_piece(key), status);
verify_icu_success(status);

View File

@@ -366,12 +366,27 @@ TEST_CASE(canonicalize_unicode_locale_id)
test("EN-U-1K-TRUE"sv, "en-u-1k"sv);
test("en-u-1k-true-abcd"sv, "en-u-1k-true-abcd"sv);
test("EN-U-1K-TRUE-ABCD"sv, "en-u-1k-true-abcd"sv);
// Keys for which "yes" is alased to "true" in the CLDR, which is then removed.
test("en-u-kb-yes"sv, "en-u-kb"sv);
test("EN-U-KB-YES"sv, "en-u-kb"sv);
test("en-u-kc-yes"sv, "en-u-kc"sv);
test("en-u-kh-yes"sv, "en-u-kh"sv);
test("en-u-kk-yes"sv, "en-u-kk"sv);
test("en-u-kn-yes"sv, "en-u-kn"sv);
// Keys for which "yes" is not alased to "true" in the CLDR, which is then preserved.
test("en-u-ka-yes"sv, "en-u-ka-yes"sv);
test("EN-U-KA-YES"sv, "en-u-ka-yes"sv);
test("en-u-kf-yes"sv, "en-u-kf-yes"sv);
test("en-u-kr-yes"sv, "en-u-kr-yes"sv);
test("en-u-ks-yes"sv, "en-u-ks-yes"sv);
test("en-u-kv-yes"sv, "en-u-kv-yes"sv);
test("en-u-ka-yes-kb-yes"sv, "en-u-ka-yes-kb"sv);
test("en-u-kb-yes-abcd"sv, "en-u-kb-yes-abcd"sv);
test("EN-U-KB-YES-ABCD"sv, "en-u-kb-yes-abcd"sv);
test("en-u-ka-yes"sv, "en-u-ka"sv);
test("EN-U-KA-YES"sv, "en-u-ka"sv);
test("en-u-1k-names"sv, "en-u-1k-names"sv);
test("EN-U-1K-NAMES"sv, "en-u-1k-names"sv);
test("en-u-ks-primary"sv, "en-u-ks-level1"sv);