mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-25 17:25:08 +02:00
LibUnicode: Work around ICU bug over-canonicalizing "yes" keyword values
ICU will canonicalize "yes" to "true" in all Unicode keywords. This is a bug - only some keywords should undergo this change. It will then change "true" to the empty string. This is a known ICU bug, which we can avoid for now by detecting which keywords should retain their "yes" value.
This commit is contained in:
Notes:
github-actions[bot]
2026-03-14 12:17:56 +00:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/f22710abbaf Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/8405
@@ -8,6 +8,7 @@
|
||||
#include <AK/NonnullOwnPtr.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <LibUnicode/ICU.h>
|
||||
#include <LibUnicode/Locale.h>
|
||||
|
||||
#include <unicode/dtptngen.h>
|
||||
#include <unicode/locdspnm.h>
|
||||
@@ -45,18 +46,65 @@ LocaleData::LocaleData(icu::Locale locale)
|
||||
{
|
||||
}
|
||||
|
||||
String LocaleData::to_string()
|
||||
String LocaleData::canonicalize(StringView locale)
|
||||
{
|
||||
if (!m_locale_string.has_value()) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
auto locale_data = LocaleData::for_locale(locale);
|
||||
VERIFY(locale_data.has_value());
|
||||
|
||||
auto result = locale().toLanguageTag<StringBuilder>(status);
|
||||
verify_icu_success(status);
|
||||
if (locale_data->m_canonical_locale_string.has_value())
|
||||
return *locale_data->m_canonical_locale_string;
|
||||
|
||||
m_locale_string = MUST(result.to_string());
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
// FIXME: ICU's canonicalize() and toLanguageTag() incorrectly convert the Unicode extension value "yes" to "true"
|
||||
// for all keywords (and then remove "true" per UTS 35). Per CLDR BCP47 data, only specific keys define "yes"
|
||||
// as an alias for "true" (kb, kc, kh, kk, kn). For other keys, "yes" must be preserved. See:
|
||||
// https://unicode-org.atlassian.net/browse/ICU-21367
|
||||
HashTable<ByteString> keywords_with_yes;
|
||||
|
||||
if (auto parsed = parse_unicode_locale_id(locale); parsed.has_value()) {
|
||||
parsed->for_each_extension_of_type<LocaleExtension>([&](auto const& extension) {
|
||||
for (auto const& keyword : extension.keywords) {
|
||||
if (!keyword.value.equals_ignoring_ascii_case("yes"sv))
|
||||
continue;
|
||||
|
||||
auto key = keyword.key.to_ascii_lowercase().to_byte_string();
|
||||
|
||||
if (auto const* legacy_key = uloc_toLegacyKey(key.characters())) {
|
||||
if (auto const* value = uloc_toUnicodeLocaleType(legacy_key, "yes"); !value || value != "true"sv)
|
||||
keywords_with_yes.set(move(key));
|
||||
}
|
||||
}
|
||||
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
}
|
||||
|
||||
return *m_locale_string;
|
||||
locale_data->locale().canonicalize(status);
|
||||
verify_icu_success(status);
|
||||
|
||||
auto result = locale_data->locale().toLanguageTag<StringBuilder>(status);
|
||||
verify_icu_success(status);
|
||||
|
||||
if (keywords_with_yes.is_empty()) {
|
||||
locale_data->m_canonical_locale_string = MUST(result.to_string());
|
||||
} else {
|
||||
auto parsed = parse_unicode_locale_id(result.string_view());
|
||||
VERIFY(parsed.has_value());
|
||||
|
||||
parsed->for_each_extension_of_type<LocaleExtension>([&](auto& extension) {
|
||||
for (auto& keyword : extension.keywords) {
|
||||
if (keyword.value.is_empty() && keywords_with_yes.contains(keyword.key.bytes_as_string_view()))
|
||||
keyword.value = "yes"_string;
|
||||
}
|
||||
|
||||
return IterationDecision::Continue;
|
||||
});
|
||||
|
||||
locale_data->m_canonical_locale_string = parsed->to_string();
|
||||
}
|
||||
|
||||
return *locale_data->m_canonical_locale_string;
|
||||
}
|
||||
|
||||
icu::LocaleDisplayNames& LocaleData::standard_display_names()
|
||||
|
||||
@@ -37,11 +37,10 @@ namespace Unicode {
|
||||
class LocaleData {
|
||||
public:
|
||||
static Optional<LocaleData&> for_locale(StringView locale);
|
||||
static String canonicalize(StringView locale);
|
||||
|
||||
ALWAYS_INLINE icu::Locale& locale() { return m_locale; }
|
||||
|
||||
String to_string();
|
||||
|
||||
icu::LocaleDisplayNames& standard_display_names();
|
||||
icu::LocaleDisplayNames& dialect_display_names();
|
||||
|
||||
@@ -58,7 +57,7 @@ private:
|
||||
explicit LocaleData(icu::Locale locale);
|
||||
|
||||
icu::Locale m_locale;
|
||||
Optional<String> m_locale_string;
|
||||
Optional<String> m_canonical_locale_string;
|
||||
|
||||
OwnPtr<icu::LocaleDisplayNames> m_standard_display_names;
|
||||
OwnPtr<icu::LocaleDisplayNames> m_dialect_display_names;
|
||||
|
||||
@@ -478,15 +478,7 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
|
||||
|
||||
String canonicalize_unicode_locale_id(StringView locale)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
auto locale_data = LocaleData::for_locale(locale);
|
||||
VERIFY(locale_data.has_value());
|
||||
|
||||
locale_data->locale().canonicalize(status);
|
||||
verify_icu_success(status);
|
||||
|
||||
return locale_data->to_string();
|
||||
return LocaleData::canonicalize(locale);
|
||||
}
|
||||
|
||||
String canonicalize_unicode_extension_values(StringView key, StringView value)
|
||||
@@ -499,9 +491,6 @@ String canonicalize_unicode_extension_values(StringView key, StringView value)
|
||||
auto locale = builder.build(status);
|
||||
verify_icu_success(status);
|
||||
|
||||
locale.canonicalize(status);
|
||||
verify_icu_success(status);
|
||||
|
||||
auto result = locale.getUnicodeKeywordValue<StringBuilder>(icu_string_piece(key), status);
|
||||
verify_icu_success(status);
|
||||
|
||||
|
||||
@@ -366,12 +366,27 @@ TEST_CASE(canonicalize_unicode_locale_id)
|
||||
test("EN-U-1K-TRUE"sv, "en-u-1k"sv);
|
||||
test("en-u-1k-true-abcd"sv, "en-u-1k-true-abcd"sv);
|
||||
test("EN-U-1K-TRUE-ABCD"sv, "en-u-1k-true-abcd"sv);
|
||||
|
||||
// Keys for which "yes" is alased to "true" in the CLDR, which is then removed.
|
||||
test("en-u-kb-yes"sv, "en-u-kb"sv);
|
||||
test("EN-U-KB-YES"sv, "en-u-kb"sv);
|
||||
test("en-u-kc-yes"sv, "en-u-kc"sv);
|
||||
test("en-u-kh-yes"sv, "en-u-kh"sv);
|
||||
test("en-u-kk-yes"sv, "en-u-kk"sv);
|
||||
test("en-u-kn-yes"sv, "en-u-kn"sv);
|
||||
|
||||
// Keys for which "yes" is not alased to "true" in the CLDR, which is then preserved.
|
||||
test("en-u-ka-yes"sv, "en-u-ka-yes"sv);
|
||||
test("EN-U-KA-YES"sv, "en-u-ka-yes"sv);
|
||||
test("en-u-kf-yes"sv, "en-u-kf-yes"sv);
|
||||
test("en-u-kr-yes"sv, "en-u-kr-yes"sv);
|
||||
test("en-u-ks-yes"sv, "en-u-ks-yes"sv);
|
||||
test("en-u-kv-yes"sv, "en-u-kv-yes"sv);
|
||||
|
||||
test("en-u-ka-yes-kb-yes"sv, "en-u-ka-yes-kb"sv);
|
||||
test("en-u-kb-yes-abcd"sv, "en-u-kb-yes-abcd"sv);
|
||||
test("EN-U-KB-YES-ABCD"sv, "en-u-kb-yes-abcd"sv);
|
||||
test("en-u-ka-yes"sv, "en-u-ka"sv);
|
||||
test("EN-U-KA-YES"sv, "en-u-ka"sv);
|
||||
|
||||
test("en-u-1k-names"sv, "en-u-1k-names"sv);
|
||||
test("EN-U-1K-NAMES"sv, "en-u-1k-names"sv);
|
||||
test("en-u-ks-primary"sv, "en-u-ks-level1"sv);
|
||||
|
||||
Reference in New Issue
Block a user