Source code

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "mozilla/intl/Locale.h"
#include "mozilla/Assertions.h"
#include "mozilla/DebugOnly.h"
#include "mozilla/MathAlgorithms.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Variant.h"
#include "ICU4CGlue.h"
#include <algorithm>
#include <iterator>
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <string.h>
#include <type_traits>
#include <utility>
#include "unicode/uloc.h"
#include "unicode/utypes.h"
namespace mozilla::intl {
using namespace intl::LanguageTagLimits;
// Returns true iff |aLanguage| matches the UTS 35 production
//
//   unicode_language_subtag = alpha{2,3} | alpha{5,8};
//
// i.e. it consists solely of ASCII letters and has a length of 2, 3, or 5-8.
template <typename CharT>
bool IsStructurallyValidLanguageTag(Span<const CharT> aLanguage) {
  const size_t count = aLanguage.size();

  // A length of four is not accepted for language subtags.
  const bool lengthOk =
      (count >= 2 && count <= 3) || (count >= 5 && count <= 8);
  if (!lengthOk) {
    return false;
  }

  const CharT* first = aLanguage.data();
  return std::all_of(first, first + count, IsAsciiAlpha<CharT>);
}
template bool IsStructurallyValidLanguageTag(Span<const char> aLanguage);
template bool IsStructurallyValidLanguageTag(Span<const Latin1Char> aLanguage);
template bool IsStructurallyValidLanguageTag(Span<const char16_t> aLanguage);
// Returns true iff |aScript| matches the UTS 35 production
//
//   unicode_script_subtag = alpha{4} ;
//
// i.e. it is exactly four ASCII letters.
template <typename CharT>
bool IsStructurallyValidScriptTag(Span<const CharT> aScript) {
  const size_t count = aScript.size();
  if (count != 4) {
    return false;
  }

  const CharT* first = aScript.data();
  return std::all_of(first, first + count, IsAsciiAlpha<CharT>);
}
template bool IsStructurallyValidScriptTag(Span<const char> aScript);
template bool IsStructurallyValidScriptTag(Span<const Latin1Char> aScript);
template bool IsStructurallyValidScriptTag(Span<const char16_t> aScript);
// Returns true iff |aRegion| matches the UTS 35 production
//
//   unicode_region_subtag = (alpha{2} | digit{3}) ;
//
// i.e. it is either two ASCII letters or three ASCII digits.
template <typename CharT>
bool IsStructurallyValidRegionTag(Span<const CharT> aRegion) {
  const CharT* first = aRegion.data();
  const CharT* last = first + aRegion.size();

  switch (aRegion.size()) {
    case 2:
      return std::all_of(first, last, IsAsciiAlpha<CharT>);
    case 3:
      return std::all_of(first, last, IsAsciiDigit<CharT>);
    default:
      return false;
  }
}
template bool IsStructurallyValidRegionTag(Span<const char> aRegion);
template bool IsStructurallyValidRegionTag(Span<const Latin1Char> aRegion);
template bool IsStructurallyValidRegionTag(Span<const char16_t> aRegion);
#ifdef DEBUG
// Returns true iff |aVariant| matches the UTS 35 production
//
//   unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
bool IsStructurallyValidVariantTag(Span<const char> aVariant) {
  const size_t count = aVariant.size();
  const char* first = aVariant.data();

  // Four character variants must start with a digit; otherwise the length
  // must be between five and eight characters.
  bool shapeOk;
  if (count == 4) {
    shapeOk = IsAsciiDigit(first[0]);
  } else {
    shapeOk = count >= 5 && count <= 8;
  }

  return shapeOk &&
         std::all_of(first, first + count, IsAsciiAlphanumeric<char>);
}
// Returns true iff the locale parser reports that |aExtension| can be parsed
// as a Unicode extension.
bool IsStructurallyValidUnicodeExtensionTag(Span<const char> aExtension) {
  auto parseResult = LocaleParser::CanParseUnicodeExtension(aExtension);
  return parseResult.isOk();
}
// Returns true iff |aExtension| has the shape
//
//   other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
//
// (without the leading separator).
//
// NB: Any singleton except "x"/"X" is accepted, including the Unicode ("u")
// and Transform ("t") singletons, because this function is only used for an
// assertion.
static bool IsStructurallyValidExtensionTag(Span<const char> aExtension) {
  const size_t total = aExtension.size();

  // The shortest possible input is a singleton, a separator, and at least one
  // more character.
  if (total <= 2) {
    return false;
  }

  const char* const begin = aExtension.data();
  const char* const end = begin + total;

  // Validate the singleton; "x" is rejected here.
  const char singleton = begin[0];
  const bool isPrivateUse = singleton == 'x' || singleton == 'X';
  if (isPrivateUse || !IsAsciiAlphanumeric(singleton)) {
    return false;
  }

  // The singleton must be followed by a separator.
  if (begin[1] != '-') {
    return false;
  }

  // Every following subtag must be 2-8 alphanumeric characters.
  const char* cursor = begin + 2;
  for (;;) {
    const char* subtagEnd = std::find(cursor, end, '-');
    const size_t subtagLength = subtagEnd - cursor;

    if (subtagLength < 2 || subtagLength > 8) {
      return false;
    }
    if (!std::all_of(cursor, subtagEnd, IsAsciiAlphanumeric<char>)) {
      return false;
    }

    // No further separator means this was the last subtag.
    if (subtagEnd == end) {
      return true;
    }
    cursor = subtagEnd + 1;
  }
}
// Returns true iff |aPrivateUse| has the shape
//
//   pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
//
// (without the leading separator).
bool IsStructurallyValidPrivateUseTag(Span<const char> aPrivateUse) {
  const size_t total = aPrivateUse.size();

  // Needs at least "x", a separator, and one more character.
  if (total <= 2) {
    return false;
  }

  const char* const begin = aPrivateUse.data();
  const char* const end = begin + total;

  // The tag must start with "x-" or "X-".
  const bool startsWithX = begin[0] == 'x' || begin[0] == 'X';
  if (!startsWithX || begin[1] != '-') {
    return false;
  }

  // Every following subtag must be 1-8 alphanumeric characters.
  const char* subtag = begin + 2;
  while (true) {
    const char* stop = std::find(subtag, end, '-');
    const size_t subtagLength = stop - subtag;

    const bool lengthOk = subtagLength >= 1 && subtagLength <= 8;
    if (!lengthOk ||
        !std::all_of(subtag, stop, IsAsciiAlphanumeric<char>)) {
      return false;
    }

    // Reaching the end without another separator means all subtags passed.
    if (stop == end) {
      return true;
    }
    subtag = stop + 1;
  }
}
#endif
// Returns the index into |mExtensions| of the first extension whose singleton
// is "u" or "U", or -1 when there is none.
ptrdiff_t Locale::UnicodeExtensionIndex() const {
  // The extension subtags aren't necessarily sorted, so a linear scan is used
  // instead of a binary search.
  ptrdiff_t position = 0;
  for (const auto& ext : mExtensions) {
    const char singleton = ext[0];
    if (singleton == 'u' || singleton == 'U') {
      return position;
    }
    position++;
  }
  return -1;
}
// Returns a span over the Unicode extension subtag, or Nothing() when no
// Unicode extension is present.
Maybe<Span<const char>> Locale::GetUnicodeExtension() const {
  const ptrdiff_t position = UnicodeExtensionIndex();
  if (position < 0) {
    return Nothing();
  }
  return Some(MakeStringSpan(mExtensions[position].get()));
}
// Stores a copy of |aExtension| as the Unicode extension of this locale. An
// existing Unicode extension is replaced; otherwise the copy is appended to
// the list of extensions. Returns ICUError::OutOfMemory when appending fails.
ICUResult Locale::SetUnicodeExtension(Span<const char> aExtension) {
  MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag(aExtension));

  auto copy = DuplicateStringToUniqueChars(aExtension);

  const ptrdiff_t existing = UnicodeExtensionIndex();
  if (existing < 0) {
    // No Unicode extension yet, so add a new entry.
    if (!mExtensions.append(std::move(copy))) {
      return Err(ICUError::OutOfMemory);
    }
    return Ok();
  }

  // Overwrite the Unicode extension which is already present.
  mExtensions[existing] = std::move(copy);
  return Ok();
}
// Removes the Unicode extension subtag, if one is present.
void Locale::ClearUnicodeExtension() {
  const ptrdiff_t position = UnicodeExtensionIndex();
  if (position < 0) {
    return;
  }
  mExtensions.erase(mExtensions.begin() + position);
}
// Sorts the zero-terminated strings in |aSubtags| in ascending strcmp() order.
// Returns false if the temporary buffer used for sorting can't be allocated.
template <size_t InitialCapacity>
static bool SortAlphabetically(Vector<UniqueChars, InitialCapacity>& aSubtags) {
  const size_t count = aSubtags.length();

  auto precedes = [](const char* a, const char* b) {
    return strcmp(a, b) < 0;
  };

  // Nothing to do for lists with fewer than two elements.
  if (count < 2) {
    return true;
  }

  // Two elements only ever need a single conditional swap.
  if (count == 2) {
    if (precedes(aSubtags[1].get(), aSubtags[0].get())) {
      aSubtags[0].swap(aSubtags[1]);
    }
    return true;
  }

  // Sort raw pointers in a scratch buffer: ownership is temporarily taken out
  // of |aSubtags| and handed back in sorted order afterwards.
  Vector<char*, 8> scratch;
  if (!scratch.resizeUninitialized(count)) {
    return false;
  }

  size_t slot = 0;
  for (auto& subtag : aSubtags) {
    scratch[slot++] = subtag.release();
  }

  std::stable_sort(scratch.begin(), scratch.end(), precedes);

  slot = 0;
  for (auto& subtag : aSubtags) {
    subtag = UniqueChars(scratch[slot++]);
  }
  return true;
}
// Canonicalizes the language, script, region, and variant subtags.
//
// Returns CanonicalizationError::DuplicateVariant when the same variant
// occurs more than once and CanonicalizationError::OutOfMemory when one of
// the fallible steps fails.
Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeBaseName() {
  // Per 6.2.3 CanonicalizeUnicodeLocaleId, the first step canonicalizes the
  // syntax: normalize the case and order all subtags. The canonical syntax
  // form is specified in UTS 35, 3.2.1.

  // Language: lower case, e.g. "JA" -> "ja".
  mLanguage.ToLowerCase();
  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));

  // Script: title case, e.g. "hans" -> "Hans".
  mScript.ToTitleCase();
  MOZ_ASSERT(Script().Missing() ||
             IsStructurallyValidScriptTag(Script().Span()));

  // Region: upper case, e.g. "bu" -> "BU".
  mRegion.ToUpperCase();
  MOZ_ASSERT(Region().Missing() ||
             IsStructurallyValidRegionTag(Region().Span()));

  // Variants: lower case.
  for (UniqueChars& variant : mVariants) {
    char* chars = variant.get();
    const size_t length = strlen(chars);
    AsciiToLowerCase(chars, length, chars);
    MOZ_ASSERT(IsStructurallyValidVariantTag({chars, length}));
  }

  // Extensions and privateuse subtags are case normalized in the
  // |CanonicalizeExtensions| method.

  // The second step in UTS 35, 3.2.1, is to order all subtags.
  //
  // 1. Any variants are in alphabetical order.
  if (mVariants.length() > 1) {
    if (!SortAlphabetically(mVariants)) {
      return Err(CanonicalizationError::OutOfMemory);
    }

    // After sorting, equal variants are adjacent. Reject the locale
    // identifier if a duplicate is found, e.g. "en-variant-Variant".
    auto sameVariant = [](const auto& a, const auto& b) {
      return strcmp(a.get(), b.get()) == 0;
    };
    if (std::adjacent_find(mVariants.begin(), mVariants.end(), sameVariant) !=
        mVariants.end()) {
      return Err(CanonicalizationError::DuplicateVariant);
    }
  }

  // 2. Any extensions are in alphabetical order by their singleton.
  // 3. All attributes are sorted in alphabetical order.
  // 4. All keywords and tfields are sorted by alphabetical order of their
  //    keys, within their respective extensions.
  // 5. Any type or tfield value "true" is removed.
  //
  // - A subsequent call to CanonicalizeExtensions() will perform these steps.

  // 6.2.3 CanonicalizeUnicodeLocaleId, step 2 transforms the locale identifier
  // into its canonical form per UTS 35, 3.2.1.
  //
  // 1. Use the bcp47 data to replace keys, types, tfields, and tvalues by
  //    their canonical forms.
  //    - A subsequent call to CanonicalizeExtensions() will perform this step.
  // 2. Replace aliases in the unicode_language_id and tlang (if any).
  //    - tlang is handled in CanonicalizeExtensions().

  // Legacy mappings for language, region, and variant subtags come first.
  if (!UpdateLegacyMappings()) {
    return Err(CanonicalizationError::OutOfMemory);
  }

  // Deprecated language subtags: try the simple mapping first and fall back
  // to the complex mappings only when no simple mapping applied.
  if (!LanguageMapping(mLanguage) && ComplexLanguageMapping(mLanguage)) {
    PerformComplexLanguageMappings();
  }

  // Deprecated script subtags.
  if (Script().Present()) {
    ScriptMapping(mScript);
  }

  // Deprecated region subtags, again simple mappings before complex ones.
  if (Region().Present() && !RegionMapping(mRegion) &&
      ComplexRegionMapping(mRegion)) {
    PerformComplexRegionMappings();
  }

  // Deprecated variant subtags.
  if (!PerformVariantMappings()) {
    return Err(CanonicalizationError::OutOfMemory);
  }

  // No extension replacements are currently present.
  // Private use sequences are left as is.

  // 3. Replace aliases in special key values.
  //    - A subsequent call to CanonicalizeExtensions() will perform this step.

  return Ok();
}
#ifdef DEBUG
// Returns true iff every character in |aSpan| is an ASCII lower case letter,
// an ASCII digit, or "-". An empty span yields true.
static bool IsAsciiLowercaseAlphanumericOrDash(Span<const char> aSpan) {
  const char* cursor = aSpan.data();
  const char* const end = cursor + aSpan.size();

  for (; cursor != end; cursor++) {
    const char c = *cursor;
    const bool allowed =
        IsAsciiLowercaseAlpha(c) || IsAsciiDigit(c) || c == '-';
    if (!allowed) {
      return false;
    }
  }
  return true;
}
#endif
// Canonicalizes all extension subtags and the private-use subtag: lower cases
// them, sorts the extensions, and canonicalizes the contents of Unicode ("u")
// and Transform ("t") extensions.
Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeExtensions() {
  // Lower case is the canonical case for all extension subtags.
  for (UniqueChars& extension : mExtensions) {
    char* chars = extension.get();
    const size_t length = strlen(chars);
    AsciiToLowerCase(chars, length, chars);

    MOZ_ASSERT(IsStructurallyValidExtensionTag({chars, length}));
  }

  // Extensions are ordered alphabetically by their singleton, e.g.
  // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese".
  if (!SortAlphabetically(mExtensions)) {
    return Err(CanonicalizationError::OutOfMemory);
  }

  // Only Unicode and Transform extensions have additional canonicalization
  // rules; all other extensions are left as is.
  for (UniqueChars& extension : mExtensions) {
    switch (extension[0]) {
      case 'u':
        MOZ_TRY(CanonicalizeUnicodeExtension(extension));
        break;
      case 't':
        MOZ_TRY(CanonicalizeTransformExtension(extension));
        break;
      default:
        break;
    }

    MOZ_ASSERT(
        IsAsciiLowercaseAlphanumericOrDash(MakeStringSpan(extension.get())));
  }

  // Lower case is also the canonical case for privateuse subtags.
  char* privateuse = mPrivateUse.get();
  if (privateuse) {
    const size_t length = strlen(privateuse);
    AsciiToLowerCase(privateuse, length, privateuse);

    MOZ_ASSERT(IsStructurallyValidPrivateUseTag({privateuse, length}));
  }

  return Ok();
}
// Appends all characters of |aSpan| to |vector|. Returns the result of the
// underlying append call.
template <size_t N>
static inline bool AppendSpan(Vector<char, N>& vector, Span<const char> aSpan) {
  const char* chars = aSpan.data();
  const size_t count = aSpan.size();
  return vector.append(chars, count);
}
/**
 * CanonicalizeUnicodeExtension( attributes, keywords )
 *
 * Canonical syntax per UTS 35, 3.2.1:
 *
 * - All attributes and keywords are in lowercase.
 *   - Note: The parser already converted keywords to lowercase.
 * - All attributes are sorted in alphabetical order.
 * - All keywords are sorted by alphabetical order of their keys.
 * - Any type value "true" is removed.
 *
 * Canonical form:
 * - All keys and types use the canonical form (from the name attribute;
 *   see Section 3.6.4 U Extension Data Files).
 *
 * |aUnicodeExtension| is only replaced when canonicalization changed it.
 */
Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeUnicodeExtension(
    UniqueChars& aUnicodeExtension) {
  using StringSpan = Span<const char>;
  using Attribute = LocaleParser::AttributesVector::ElementType;
  using Keyword = LocaleParser::KeywordsVector::ElementType;

  static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1;
  static constexpr StringSpan True = MakeStringSpan("true");

  const StringSpan extension = MakeStringSpan(aUnicodeExtension.get());
  MOZ_ASSERT(extension[0] == 'u');
  MOZ_ASSERT(extension[1] == '-');
  MOZ_ASSERT(IsStructurallyValidExtensionTag(extension));

  // Split the extension into its attributes and keywords.
  LocaleParser::AttributesVector attributes;
  LocaleParser::KeywordsVector keywords;
  if (LocaleParser::ParseUnicodeExtension(extension, attributes, keywords)
          .isErr()) {
    MOZ_ASSERT_UNREACHABLE("unexpected invalid Unicode extension subtag");
    return Err(CanonicalizationError::InternalError);
  }

  // Helpers mapping a parsed range back to the text it covers.
  auto attributeText = [extension](const Attribute& aAttribute) {
    return extension.Subspan(aAttribute.Begin(), aAttribute.Length());
  };
  auto keyText = [extension](const Keyword& aKeyword) {
    return extension.Subspan(aKeyword.Begin(), UnicodeKeyLength);
  };

  auto attributesLess = [&attributeText](const Attribute& a,
                                         const Attribute& b) {
    return attributeText(a) < attributeText(b);
  };
  auto keywordsLess = [&keyText](const Keyword& a, const Keyword& b) {
    return keyText(a) < keyText(b);
  };

  // Attributes are ordered alphabetically.
  if (attributes.length() > 1) {
    std::stable_sort(attributes.begin(), attributes.end(), attributesLess);
  }

  // Keywords are ordered alphabetically by key.
  //
  // The sort must be stable so that keywords sharing a key keep their
  // relative order. For example "u-nu-thai-kf-false-nu-latn" is sorted to
  // "u-kf-false-nu-thai-nu-latn", where "nu-thai" still precedes "nu-latn".
  // The deduplication below relies on this to keep the first keyword for a
  // given key and to drop the rest.
  if (keywords.length() > 1) {
    std::stable_sort(keywords.begin(), keywords.end(), keywordsLess);
  }

  // Build the canonical extension string.
  Vector<char, 32> sb;
  if (!sb.append('u')) {
    return Err(CanonicalizationError::OutOfMemory);
  }

  // Emit the attributes, dropping any attribute equal to its predecessor.
  const Attribute* previousAttribute = nullptr;
  for (const Attribute& attribute : attributes) {
    const Attribute* previous = std::exchange(previousAttribute, &attribute);
    const StringSpan text = attributeText(attribute);

    if (previous) {
      if (text == attributeText(*previous)) {
        continue;
      }
      MOZ_ASSERT(attributesLess(*previous, attribute));
    }

    if (!sb.append('-') || !AppendSpan(sb, text)) {
      return Err(CanonicalizationError::OutOfMemory);
    }
  }

  // Emit the keywords, dropping any keyword whose key equals the key of its
  // predecessor.
  const Keyword* previousKeyword = nullptr;
  for (const Keyword& keyword : keywords) {
    const Keyword* previous = std::exchange(previousKeyword, &keyword);

    if (previous) {
      if (keyText(keyword) == keyText(*previous)) {
        continue;
      }
      MOZ_ASSERT(keywordsLess(*previous, keyword));
    }

    if (!sb.append('-')) {
      return Err(CanonicalizationError::OutOfMemory);
    }

    const StringSpan full =
        extension.Subspan(keyword.Begin(), keyword.Length());

    // A keyword consisting of just the key has no type to canonicalize.
    if (full.size() == UnicodeKeyLength) {
      if (!AppendSpan(sb, full)) {
        return Err(CanonicalizationError::OutOfMemory);
      }
      continue;
    }

    const StringSpan key = full.To(UnicodeKeyLength);
    const StringSpan parsedType = full.From(UnicodeKeyWithSepLength);

    // Use the replacement type when there's one for this keyword, otherwise
    // keep the type from the input.
    const char* replacement = ReplaceUnicodeExtensionType(key, parsedType);
    const StringSpan type =
        replacement ? MakeStringSpan(replacement) : parsedType;

    if (type == True) {
      // The type "true" is elided, so only the key is emitted.
      if (!AppendSpan(sb, key)) {
        return Err(CanonicalizationError::OutOfMemory);
      }
    } else {
      // Emit the key including its separator, followed by the type.
      if (!AppendSpan(sb, full.To(UnicodeKeyWithSepLength)) ||
          !AppendSpan(sb, type)) {
        return Err(CanonicalizationError::OutOfMemory);
      }
    }
  }

  // Keep the existing string when canonicalization didn't change anything.
  if (static_cast<Span<const char>>(sb) == extension) {
    return Ok();
  }

  // Otherwise install the canonical extension.
  UniqueChars canonical = DuplicateStringToUniqueChars(sb);
  if (!canonical) {
    return Err(CanonicalizationError::OutOfMemory);
  }
  aUnicodeExtension = std::move(canonical);
  return Ok();
}
template <class Buffer>
static bool LocaleToString(const Locale& aTag, Buffer& aBuffer) {
auto appendSubtag = [&aBuffer](const auto& subtag) {
auto span = subtag.Span();
MOZ_ASSERT(!span.empty());
return aBuffer.append(span.data(), span.size());
};
auto appendSubtagSpan = [&aBuffer](Span<const char> subtag) {
MOZ_ASSERT(!subtag.empty());
return aBuffer.append(subtag.data(), subtag.size());
};
auto appendSubtags = [&aBuffer, &appendSubtagSpan](const auto& subtags) {
for (const auto& subtag : subtags) {
if (!aBuffer.append('-') || !appendSubtagSpan(subtag)) {
return false;
}
}
return true;
};
// Append the language subtag.
if (!appendSubtag(aTag.Language())) {
return false;
}
// Append the script subtag if present.
if (aTag.Script().Present()) {
if (!aBuffer.append('-') || !appendSubtag(aTag.Script())) {
return false;
}
}
// Append the region subtag if present.
if (aTag.Region().Present()) {
if (!aBuffer.append('-') || !appendSubtag(aTag.Region())) {
return false;
}
}
// Append the variant subtags if present.
if (!appendSubtags(aTag.Variants())) {
return false;
}
// Append the extensions subtags if present.
if (!appendSubtags(aTag.Extensions())) {
return false;
}
// Append the private-use subtag if present.
if (auto privateuse = aTag.PrivateUse()) {
if (!aBuffer.append('-') || !appendSubtagSpan(privateuse.value())) {
return false;
}
}
return true;
}
/**
 * CanonicalizeTransformExtension
 *
 * Canonical syntax for Transform extensions:
 *
 * - These subtags are all in lowercase (that is the canonical casing for these
 *   subtags), [...].
 *
 * And per UTS 35, 3.2.1:
 *
 * - All keywords and tfields are sorted by alphabetical order of their keys,
 *   within their respective extensions.
 *
 * |aTransformExtension| is only replaced when canonicalization changed it.
 */
Result<Ok, Locale::CanonicalizationError>
Locale::CanonicalizeTransformExtension(UniqueChars& aTransformExtension) {
  using StringSpan = Span<const char>;
  using TField = LocaleParser::TFieldVector::ElementType;

  static constexpr size_t TransformKeyWithSepLength = TransformKeyLength + 1;

  const StringSpan extension = MakeStringSpan(aTransformExtension.get());
  MOZ_ASSERT(extension[0] == 't');
  MOZ_ASSERT(extension[1] == '-');
  MOZ_ASSERT(IsStructurallyValidExtensionTag(extension));

  // Split the extension into its tlang part and its tfields.
  Locale tag;
  LocaleParser::TFieldVector fields;
  if (LocaleParser::ParseTransformExtension(extension, tag, fields).isErr()) {
    MOZ_ASSERT_UNREACHABLE("unexpected invalid transform extension subtag");
    return Err(CanonicalizationError::InternalError);
  }

  // Maps a parsed tfield back to the text of its key.
  auto keyText = [extension](const TField& aField) {
    return extension.Subspan(aField.Begin(), TransformKeyLength);
  };
  auto tfieldLess = [&keyText](const TField& a, const TField& b) {
    return keyText(a) < keyText(b);
  };

  // tfields are ordered alphabetically by key.
  if (fields.length() > 1) {
    std::stable_sort(fields.begin(), fields.end(), tfieldLess);
  }

  // Build the canonical extension string.
  Vector<char, 32> sb;
  if (!sb.append('t')) {
    return Err(CanonicalizationError::OutOfMemory);
  }

  // Emit tlang, if present, with its aliases replaced.
  if (tag.Language().Present()) {
    if (!sb.append('-')) {
      return Err(CanonicalizationError::OutOfMemory);
    }

    MOZ_TRY(tag.CanonicalizeBaseName());

    // Transform extensions use lowercase as their canonical case, so convert
    // the two subtags whose canonical syntax isn't lowercase.
    tag.mScript.ToLowerCase();
    tag.mRegion.ToLowerCase();

    if (!LocaleToString(tag, sb)) {
      return Err(CanonicalizationError::OutOfMemory);
    }
  }

  // Emit all tfields.
  //
  // UTS 35, 3.2.1 specifies:
  // - Any type or tfield value "true" is removed.
  //
  // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore
  // this apparently invalid part of the UTS 35 specification and simply
  // emit all `tfield` subtags.
  for (const auto& field : fields) {
    if (!sb.append('-')) {
      return Err(CanonicalizationError::OutOfMemory);
    }

    const StringSpan full = extension.Subspan(field.Begin(), field.Length());
    const StringSpan key = full.To(TransformKeyLength);
    const StringSpan parsedValue = full.From(TransformKeyWithSepLength);

    // Use the replacement value when there's one for this tfield, otherwise
    // keep the value from the input.
    const char* replacement = ReplaceTransformExtensionType(key, parsedValue);
    const StringSpan value =
        replacement ? MakeStringSpan(replacement) : parsedValue;

    // Emit the key including its separator, followed by the value.
    if (!AppendSpan(sb, full.To(TransformKeyWithSepLength)) ||
        !AppendSpan(sb, value)) {
      return Err(CanonicalizationError::OutOfMemory);
    }
  }

  // Keep the existing string when canonicalization didn't change anything.
  if (static_cast<Span<const char>>(sb) == extension) {
    return Ok();
  }

  // Otherwise install the canonical extension.
  UniqueChars canonical = DuplicateStringToUniqueChars(sb);
  if (!canonical) {
    return Err(CanonicalizationError::OutOfMemory);
  }
  aTransformExtension = std::move(canonical);
  return Ok();
}
// Zero-terminated ICU Locale ID.
//
// The inline capacity covers a language, a script, and a region subtag, each
// followed by one extra character (presumably the two separators and the
// terminating zero).
using LocaleId =
Vector<char, LanguageLength + 1 + ScriptLength + 1 + RegionLength + 1>;
// Selects whether likely subtags are added to or removed from a locale.
enum class LikelySubtags : bool { Add, Remove };
// Return true iff the locale is already maximized (for LikelySubtags::Add)
// resp. minimized (for LikelySubtags::Remove).
static bool HasLikelySubtags(LikelySubtags aLikelySubtags, const Locale& aTag) {
  // In both modes the placeholder language "und" means there's work to do.
  if (aTag.Language().EqualTo("und")) {
    return false;
  }

  if (aLikelySubtags == LikelySubtags::Add) {
    // Maximized: script and region are both present and neither of them is a
    // placeholder subtag ("Zzzz" resp. "ZZ").
    if (!aTag.Script().Present() || aTag.Script().EqualTo("Zzzz")) {
      return false;
    }
    return aTag.Region().Present() && !aTag.Region().EqualTo("ZZ");
  }

  // Minimized: only the language subtag is present.
  return aTag.Script().Missing() && aTag.Region().Missing();
}
// Create an ICU locale ID from the given locale: the language subtag followed
// by the script and region subtags, if present, separated by "_" and
// terminated with a zero character. Returns false when an append fails.
static bool CreateLocaleForLikelySubtags(const Locale& aTag,
                                         LocaleId& aLocale) {
  MOZ_ASSERT(aLocale.length() == 0);

  // Appends a non-empty subtag, optionally preceded by the ICU separator.
  auto append = [&aLocale](const auto& subtag, bool withSeparator) -> bool {
    auto span = subtag.Span();
    MOZ_ASSERT(!span.empty());
    if (withSeparator && !aLocale.append('_')) {
      return false;
    }
    return aLocale.append(span.data(), span.size());
  };

  // The language subtag is always written.
  if (!append(aTag.Language(), /* withSeparator = */ false)) {
    return false;
  }

  // Optional script subtag.
  if (aTag.Script().Present() &&
      !append(aTag.Script(), /* withSeparator = */ true)) {
    return false;
  }

  // Optional region subtag.
  if (aTag.Region().Present() &&
      !append(aTag.Region(), /* withSeparator = */ true)) {
    return false;
  }

  // ICU expects a zero-terminated string.
  return aLocale.append('\0');
}
// Maps a locale parser error to the corresponding ICUError. Crashes when the
// value isn't one of the known parser errors.
static ICUError ParserErrorToICUError(LocaleParser::ParserError aErr) {
  switch (aErr) {
    case LocaleParser::ParserError::OutOfMemory:
      return ICUError::OutOfMemory;
    case LocaleParser::ParserError::NotParseable:
      return ICUError::InternalError;
  }
  MOZ_CRASH("Unexpected parser error");
}
// Maps a canonicalization error to the corresponding ICUError. Crashes when
// the value isn't one of the known canonicalization errors.
static ICUError CanonicalizationErrorToICUError(
    Locale::CanonicalizationError aErr) {
  switch (aErr) {
    case Locale::CanonicalizationError::OutOfMemory:
      return ICUError::OutOfMemory;
    case Locale::CanonicalizationError::InternalError:
    case Locale::CanonicalizationError::DuplicateVariant:
      return ICUError::InternalError;
  }
  MOZ_CRASH("Unexpected canonicalization error");
}
// Assign the language, script, and region subtags from an ICU locale ID.
//
// ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to
// retrieve these subtags, but unfortunately these functions are rather slow, so
// we use our own implementation.
//
// |aLocaleId| is modified in place: ICU separators are replaced with "-" and
// "und" is inserted when the language subtag is missing. Returns
// ICUError::OutOfMemory when the locale ID can't be grown and the mapped
// parser error when the locale ID can't be parsed.
static ICUResult AssignFromLocaleId(LocaleId& aLocaleId, Locale& aTag) {
  // Replace the ICU locale ID separator.
  std::replace(aLocaleId.begin(), aLocaleId.end(), '_', '-');

  // ICU replaces "und" with the empty string, which means "und" becomes "" and
  // "und-Latn" becomes "-Latn". Handle this case separately.
  if (aLocaleId.empty() || aLocaleId[0] == '-') {
    static constexpr auto und = MakeStringSpan("und");
    constexpr size_t length = und.size();

    // Remember how many characters are present before growing: only those
    // characters must be shifted. Using the length after |growBy| would move
    // |length| characters too many and write past the end of the vector's
    // elements.
    const size_t previousLength = aLocaleId.length();

    // Insert "und" in front of the locale ID.
    if (!aLocaleId.growBy(length)) {
      return Err(ICUError::OutOfMemory);
    }
    memmove(aLocaleId.begin() + length, aLocaleId.begin(), previousLength);
    memmove(aLocaleId.begin(), und.data(), length);
  }

  // Retrieve the language, script, and region subtags from the locale ID.
  Locale localeTag;
  MOZ_TRY(LocaleParser::TryParseBaseName(aLocaleId, localeTag)
              .mapErr(ParserErrorToICUError));

  aTag.SetLanguage(localeTag.Language());
  aTag.SetScript(localeTag.Script());
  aTag.SetRegion(localeTag.Region());

  return Ok();
}
// Calls the ICU function |likelySubtagsFn| with the zero-terminated locale ID
// |aLocaleId| and stores the output in |aResult|, which must be empty.
template <decltype(uloc_addLikelySubtags) likelySubtagsFn>
static ICUResult CallLikelySubtags(const LocaleId& aLocaleId,
                                   LocaleId& aResult) {
  // ICU requires a zero-terminated locale ID.
  MOZ_ASSERT(aLocaleId.back() == '\0');
  MOZ_ASSERT(aResult.length() == 0);

  // Make the complete inline storage available for the result.
  MOZ_ALWAYS_TRUE(aResult.resize(LocaleId::InlineLength));

  const char* input = aLocaleId.begin();
  auto icuCall = [input](char* chars, int32_t size, UErrorCode* status) {
    return likelySubtagsFn(input, chars, size, status);
  };
  return FillBufferWithICUCall(aResult, icuCall);
}
// The canonical way to compute the Unicode BCP 47 locale identifier with likely
// subtags is as follows:
//
// 1. Call uloc_forLanguageTag() to transform the locale identifier into an ICU
//    locale ID.
// 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID.
// 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into
//    a Unicode BCP 47 locale identifier.
//
// Since uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow
// and we know, by construction, that the input Unicode BCP 47 locale identifier
// only contains valid language, script, and region subtags, we can avoid both
// calls if we implement them ourselves, see CreateLocaleForLikelySubtags() and
// AssignFromLocaleId(). (Where "slow" means about 50% of the execution time of
// |Intl.Locale.prototype.maximize|.)
static ICUResult LikelySubtags(LikelySubtags aLikelySubtags, Locale& aTag) {
  // Nothing to do when the input is already maximized/minimized.
  if (HasLikelySubtags(aLikelySubtags, aTag)) {
    return Ok();
  }

  // Build the ICU locale ID for |aTag|.
  LocaleId input;
  if (!CreateLocaleForLikelySubtags(aTag, input)) {
    return Err(ICUError::OutOfMemory);
  }

  // Let ICU add resp. remove the likely subtags.
  LocaleId output;
  const bool maximize = aLikelySubtags == LikelySubtags::Add;
  if (maximize) {
    MOZ_TRY(CallLikelySubtags<uloc_addLikelySubtags>(input, output));
  } else {
    MOZ_TRY(CallLikelySubtags<uloc_minimizeSubtags>(input, output));
  }

  // Copy the language, script, and region subtags from ICU's result.
  MOZ_TRY(AssignFromLocaleId(output, aTag));

  // ICU may have returned a non-canonical locale, so update the mappings.
  MOZ_TRY(aTag.CanonicalizeBaseName().mapErr(CanonicalizationErrorToICUError));

  return Ok();
}
// Adds likely subtags to this locale; see LikelySubtags().
ICUResult Locale::AddLikelySubtags() {
  Locale& self = *this;
  return LikelySubtags(LikelySubtags::Add, self);
}
// Removes likely subtags from this locale; see LikelySubtags().
ICUResult Locale::RemoveLikelySubtags() {
  Locale& self = *this;
  return LikelySubtags(LikelySubtags::Remove, self);
}
// Returns a newly allocated copy of the zero-terminated string |aStr|,
// including its terminating zero character.
UniqueChars Locale::DuplicateStringToUniqueChars(const char* aStr) {
  // Copy the terminator along with the characters.
  const size_t sizeWithTerminator = strlen(aStr) + 1;

  auto copy = MakeUnique<char[]>(sizeWithTerminator);
  std::copy_n(aStr, sizeWithTerminator, copy.get());
  return copy;
}
// Returns a newly allocated, zero-terminated copy of the characters in
// |aStr|.
UniqueChars Locale::DuplicateStringToUniqueChars(Span<const char> aStr) {
  const size_t count = aStr.size();

  // One extra character for the terminator.
  auto copy = MakeUnique<char[]>(count + 1);
  char* dest = copy.get();
  std::copy_n(aStr.data(), count, dest);
  dest[count] = '\0';
  return copy;
}
size_t Locale::ToStringCapacity() const {
// This is a bit awkward, the buffer class currently does not support
// being resized, so we need to calculate the required size up front and
// reserve it all at once.
auto lengthSubtag = [](const auto& subtag) {
auto span = subtag.Span();
MOZ_ASSERT(!span.empty());
return span.size();
};
auto lengthSubtagZ = [](const char* subtag) {
size_t length = strlen(subtag);
MOZ_ASSERT(length > 0);
return length;
};
auto lengthSubtagsZ = [&lengthSubtagZ](const auto& subtags) {
size_t length = 0;
for (const auto& subtag : subtags) {
length += lengthSubtagZ(subtag.get()) + 1;
}
return length;
};
// First calculate required capacity
size_t capacity = 0;
capacity += lengthSubtag(mLanguage);
if (mScript.Present()) {
capacity += lengthSubtag(mScript) + 1;
}
if (mRegion.Present()) {
capacity += lengthSubtag(mRegion) + 1;
}
capacity += lengthSubtagsZ(mVariants);
capacity += lengthSubtagsZ(mExtensions);
if (mPrivateUse.get()) {
capacity += lengthSubtagZ(mPrivateUse.get()) + 1;
}
return capacity;
}
size_t Locale::ToStringAppend(char* aBuffer) const {
// Current write position inside buffer.
size_t offset = 0;
auto appendHyphen = [&offset, &aBuffer]() {
aBuffer[offset] = '-';
offset += 1;
};
auto appendSubtag = [&offset, &aBuffer](const auto& subtag) {
auto span = subtag.Span();
memcpy(aBuffer + offset, span.data(), span.size());
offset += span.size();
};
auto appendSubtagZ = [&offset, &aBuffer](const char* subtag) {
size_t length = strlen(subtag);
memcpy(aBuffer + offset, subtag, length);
offset += length;
};
auto appendSubtagsZ = [&appendHyphen, &appendSubtagZ](const auto& subtags) {
for (const auto& subtag : subtags) {
appendHyphen();
appendSubtagZ(subtag.get());
}
};
// Append the language subtag.
appendSubtag(mLanguage);
// Append the script subtag if present.
if (mScript.Present()) {
appendHyphen();
appendSubtag(mScript);
}
// Append the region subtag if present.
if (mRegion.Present()) {
appendHyphen();
appendSubtag(mRegion);
}
// Append the variant subtags if present.
appendSubtagsZ(mVariants);
// Append the extensions subtags if present.
appendSubtagsZ(mExtensions);
// Append the private-use subtag if present.
if (mPrivateUse.get()) {
appendHyphen();
appendSubtagZ(mPrivateUse.get());
}
return offset;
}
// Reads the next token, which extends up to the next separator or the end of
// the input, and advances the parser past the token and its separator.
//
// The token kind records whether letters and/or digits were seen; it stays
// |TokenKind::None| when no characters are left. An error token is returned
// for any other character and for a "-" which starts the token or is the last
// character of the input.
LocaleParser::Token LocaleParser::NextToken() {
  MOZ_ASSERT(mIndex <= mLength + 1, "called after 'None' token was read");

  const size_t start = mIndex;
  TokenKind kind = TokenKind::None;

  size_t pos = start;
  for (; pos < mLength; pos++) {
    const char c = CharAt(pos);

    if (c == '-') {
      // A separator ends the token, but only when it's neither the first
      // character of the token nor the last character of the input.
      if (pos > start && pos + 1 < mLength) {
        break;
      }
      return {TokenKind::Error, 0, 0};
    }

    // UTS 35, section 3.1.
    // alpha = [A-Z a-z] ;
    // digit = [0-9] ;
    if (IsAsciiAlpha(c)) {
      kind |= TokenKind::Alpha;
    } else if (IsAsciiDigit(c)) {
      kind |= TokenKind::Digit;
    } else {
      return {TokenKind::Error, 0, 0};
    }
  }

  const size_t tokenLength = pos - start;

  // Skip over the token and the separator following it.
  mIndex = start + tokenLength + 1;
  return Token{kind, start, tokenLength};
}
// Returns a newly allocated, zero-terminated copy of the |aLength| characters
// of the input starting at |aIndex|.
UniqueChars LocaleParser::Chars(size_t aIndex, size_t aLength) const {
  // One extra character for the terminator.
  auto chars = MakeUnique<char[]>(aLength + 1);

  auto source = mLocale + aIndex;
  std::copy(source, source + aLength, chars.get());
  chars[aLength] = '\0';
  return chars;
}
// Parse the `unicode_language_id` production.
//
// unicode_language_id = unicode_language_subtag
//                       (sep unicode_script_subtag)?
//                       (sep unicode_region_subtag)?
//                       (sep unicode_variant_subtag)* ;
//
// sep                 = "-"
//
// Note: Unicode CLDR locale identifier backward compatibility extensions
//       removed from `unicode_language_id`.
//
// |aTok| is the current token from |aLocaleParser|. On success it is the first
// token which isn't part of the `unicode_language_id`.
//
// All subtags will be added unaltered to |aTag|, without canonicalizing their
// case or, in the case of variant subtags, detecting and rejecting duplicate
// variants. Users must subsequently |CanonicalizeBaseName| to perform these
// actions.
//
// Do not use this function directly: use |ParseBaseName| or
// |ParseTlangFromTransformExtension| instead.
Result<Ok, LocaleParser::ParserError> LocaleParser::InternalParseBaseName(
    LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) {
  // The language subtag is mandatory.
  if (!aLocaleParser.IsLanguage(aTok)) {
    return Err(ParserError::NotParseable);
  }
  aLocaleParser.CopyChars(aTok, aTag.mLanguage);
  aTok = aLocaleParser.NextToken();

  // Optional script subtag.
  if (aLocaleParser.IsScript(aTok)) {
    aLocaleParser.CopyChars(aTok, aTag.mScript);
    aTok = aLocaleParser.NextToken();
  }

  // Optional region subtag.
  if (aLocaleParser.IsRegion(aTok)) {
    aLocaleParser.CopyChars(aTok, aTag.mRegion);
    aTok = aLocaleParser.NextToken();
  }

  // Any number of variant subtags.
  auto& variants = aTag.mVariants;
  MOZ_ASSERT(variants.length() == 0);
  for (; aLocaleParser.IsVariant(aTok); aTok = aLocaleParser.NextToken()) {
    auto variant = aLocaleParser.Chars(aTok);
    if (!variants.append(std::move(variant))) {
      return Err(ParserError::OutOfMemory);
    }
  }

  return Ok();
}
// Parse |aLocale| as a complete `unicode_locale_id` and fill |aTag| from it.
//
// The base name is handled by |ParseBaseName|; each extension sequence is
// appended to |aTag.mExtensions| and a trailing private-use sequence is stored
// in |aTag.mPrivateUse|.
//
// Returns |ParserError::NotParseable| when the input doesn't match the grammar
// (including duplicate singletons, singletons without any following subtag,
// and unconsumed trailing input), and |ParserError::OutOfMemory| when
// appending an extension fails. Errors from |ParseBaseName| are propagated
// through MOZ_TRY.
Result<Ok, LocaleParser::ParserError> LocaleParser::TryParse(
mozilla::Span<const char> aLocale, Locale& aTag) {
// |aTag| must be a new, empty Locale.
MOZ_ASSERT(aTag.Language().Missing());
MOZ_ASSERT(aTag.Script().Missing());
MOZ_ASSERT(aTag.Region().Missing());
MOZ_ASSERT(aTag.Variants().empty());
MOZ_ASSERT(aTag.Extensions().empty());
MOZ_ASSERT(aTag.PrivateUse().isNothing());
// unicode_locale_id = unicode_language_id
// extensions*
// pu_extensions? ;
LocaleParser ts(aLocale);
Token tok = ts.NextToken();
MOZ_TRY(ParseBaseName(ts, aTag, tok));
// extensions = unicode_locale_extensions
// | transformed_extensions
// | other_extensions ;
// Bit set of seen singletons.
// The bit position for a singleton is |AsciiAlphanumericToNumber(singleton) + 1|.
uint64_t seenSingletons = 0;
auto& extensions = aTag.mExtensions;
while (ts.IsExtensionStart(tok)) {
char singleton = ts.SingletonKey(tok);
// Reject the input if a duplicate singleton was found.
uint64_t hash = 1ULL << (AsciiAlphanumericToNumber(singleton) + 1);
if (seenSingletons & hash) {
return Err(ParserError::NotParseable);
}
seenSingletons |= hash;
// Remember the singleton token: the extension text handed to |Extension|
// below starts here.
Token start = tok;
tok = ts.NextToken();
// We'll check for missing non-singleton subtags after this block by
// comparing |startValue| with the then-current position.
size_t startValue = tok.Index();
if (singleton == 'u') {
// Unicode extension: skip every subtag accepted by
// |IsUnicodeExtensionPart|; no further structure is checked here.
while (ts.IsUnicodeExtensionPart(tok)) {
tok = ts.NextToken();
}
} else if (singleton == 't') {
// transformed_extensions = sep [tT]
// ((sep tlang (sep tfield)*)
// | (sep tfield)+) ;
// tlang = unicode_language_subtag
// (sep unicode_script_subtag)?
// (sep unicode_region_subtag)?
// (sep unicode_variant_subtag)* ;
// The `tlang` subtags are only skipped over here; nothing is copied out of
// them in this function.
if (ts.IsLanguage(tok)) {
tok = ts.NextToken();
if (ts.IsScript(tok)) {
tok = ts.NextToken();
}
if (ts.IsRegion(tok)) {
tok = ts.NextToken();
}
while (ts.IsVariant(tok)) {
tok = ts.NextToken();
}
}
// tfield = tkey tvalue;
while (ts.IsTransformExtensionKey(tok)) {
tok = ts.NextToken();
// If no `tvalue` subtag is consumed below, |tok.Index()| won't have moved
// past |startTValue|.
size_t startTValue = tok.Index();
while (ts.IsTransformExtensionPart(tok)) {
tok = ts.NextToken();
}
// `tfield` requires at least one `tvalue`.
if (tok.Index() <= startTValue) {
return Err(ParserError::NotParseable);
}
}
} else {
// Any other singleton: skip every subtag accepted by
// |IsOtherExtensionPart|.
while (ts.IsOtherExtensionPart(tok)) {
tok = ts.NextToken();
}
}
// Singletons must be followed by a non-singleton subtag, "en-a-b" is not
// allowed.
// NOTE(review): |NextToken| builds its error token with index 0, so this
// comparison presumably also rejects an error token -- confirm against
// |Token::Index|.
if (tok.Index() <= startValue) {
return Err(ParserError::NotParseable);
}
// |Extension(start, tok)| presumably yields the text from the singleton up
// to, but excluding, |tok| -- see its definition to confirm.
UniqueChars extension = ts.Extension(start, tok);
if (!extensions.append(std::move(extension))) {
return Err(ParserError::OutOfMemory);
}
}
// Trailing `pu_extension` component of the `unicode_locale_id` production.
if (ts.IsPrivateUseStart(tok)) {
Token start = tok;
tok = ts.NextToken();
size_t startValue = tok.Index();
while (ts.IsPrivateUsePart(tok)) {
tok = ts.NextToken();
}
// There must be at least one subtag after the "-x-".
if (tok.Index() <= startValue) {
return Err(ParserError::NotParseable);
}
UniqueChars privateUse = ts.Extension(start, tok);
aTag.mPrivateUse = std::move(privateUse);
}
// Everything must have been consumed; any leftover token (including an error
// token) makes the whole input unparseable.
if (!tok.IsNone()) {
return Err(ParserError::NotParseable);
}
return Ok();
}
// Parse |aLocale| as a bare `unicode_language_id` (language, script, region
// and variants only) into |aTag|. Input that continues past the base name,
// e.g. with extension subtags, is rejected as |NotParseable|.
Result<Ok, LocaleParser::ParserError> LocaleParser::TryParseBaseName(
    Span<const char> aLocale, Locale& aTag) {
  // |aTag| must be a new, empty Locale.
  MOZ_ASSERT(aTag.Language().Missing());
  MOZ_ASSERT(aTag.Script().Missing());
  MOZ_ASSERT(aTag.Region().Missing());
  MOZ_ASSERT(aTag.Variants().empty());
  MOZ_ASSERT(aTag.Extensions().empty());
  MOZ_ASSERT(aTag.PrivateUse().isNothing());

  LocaleParser parser(aLocale);
  Token current = parser.NextToken();
  MOZ_TRY(ParseBaseName(parser, aTag, current));

  // The base name has to span the entire input.
  if (current.IsNone()) {
    return Ok();
  }
  return Err(ParserError::NotParseable);
}
// Parse |aExtension|, which must be a valid `transformed_extensions` subtag
// sequence. The `tlang` component, if present, is parsed into |aTag|, and one
// (begin index, length) entry per `tfield` is appended to |aFields|.
//
// Returns |NotParseable| for input that isn't a transform extension and
// |OutOfMemory| when |aFields| can't grow.
Result<Ok, LocaleParser::ParserError> LocaleParser::ParseTransformExtension(
    Span<const char> aExtension, Locale& aTag, TFieldVector& aFields) {
  LocaleParser parser(aExtension);
  Token current = parser.NextToken();

  // The sequence has to open with the `t` singleton...
  const bool opensWithT =
      parser.IsExtensionStart(current) && parser.SingletonKey(current) == 't';
  if (!opensWithT) {
    return Err(ParserError::NotParseable);
  }

  // ...and the singleton can't stand alone.
  current = parser.NextToken();
  if (current.IsNone()) {
    return Err(ParserError::NotParseable);
  }

  if (!parser.IsLanguage(current)) {
    // Without a `tlang` subtag, at least one `tfield` must be present.
    MOZ_ASSERT(parser.IsTransformExtensionKey(current));
  } else {
    // A possible `tlang` inside a known-valid transform extension: use the
    // special-purpose function that takes advantage of this to compute the
    // lowercased |aTag| contents in an optimal manner.
    MOZ_TRY(ParseTlangInTransformExtension(parser, aTag, current));

    // `tlang` is followed either by the `tkey` of a `tfield` or by the end of
    // the transform extension.
    MOZ_ASSERT(parser.IsTransformExtensionKey(current) || current.IsNone());
  }

  // Trailing `tfield` subtags. (Any other trailing subtags are an error,
  // because only a valid transform extension is expected here.)
  while (parser.IsTransformExtensionKey(current)) {
    size_t keyStart = current.Index();
    current = parser.NextToken();

    // Consume all `tvalue` subtags belonging to this key.
    const size_t valueStart = current.Index();
    while (parser.IsTransformExtensionPart(current)) {
      current = parser.NextToken();
    }

    // `tfield` requires at least one `tvalue`.
    if (current.Index() <= valueStart) {
      return Err(ParserError::NotParseable);
    }

    // The field runs from its key up to, but excluding, the separator in
    // front of the current token.
    size_t fieldLength = current.Index() - 1 - keyStart;
    if (!aFields.emplaceBack(keyStart, fieldLength)) {
      return Err(ParserError::OutOfMemory);
    }
  }

  if (current.IsNone()) {
    return Ok();
  }
  return Err(ParserError::NotParseable);
}
// Parse |aExtension|, which must be a valid `unicode_locale_extensions` subtag
// sequence. One (index, length) entry per `attribute` is appended to
// |aAttributes| and one per `keyword` (key plus its type subtags) to
// |aKeywords|.
//
// Returns |NotParseable| for input that isn't a Unicode extension and
// |OutOfMemory| when either vector can't grow.
Result<Ok, LocaleParser::ParserError> LocaleParser::ParseUnicodeExtension(
    Span<const char> aExtension, AttributesVector& aAttributes,
    KeywordsVector& aKeywords) {
  LocaleParser parser(aExtension);
  Token current = parser.NextToken();

  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
  //                                       (sep attribute)+ (sep keyword)*) ;

  // The sequence has to open with the `u` singleton...
  const bool opensWithU =
      parser.IsExtensionStart(current) && parser.SingletonKey(current) == 'u';
  if (!opensWithU) {
    return Err(ParserError::NotParseable);
  }

  // ...and the singleton can't stand alone.
  current = parser.NextToken();
  if (current.IsNone()) {
    return Err(ParserError::NotParseable);
  }

  // Leading attributes: each one is a single subtag.
  for (; parser.IsUnicodeExtensionAttribute(current);
       current = parser.NextToken()) {
    if (!aAttributes.emplaceBack(current.Index(), current.Length())) {
      return Err(ParserError::OutOfMemory);
    }
  }

  // keyword = key (sep type)? ;
  while (parser.IsUnicodeExtensionKey(current)) {
    size_t keyStart = current.Index();
    current = parser.NextToken();

    // Consume the type subtags that belong to this key, if any.
    while (parser.IsUnicodeExtensionType(current)) {
      current = parser.NextToken();
    }

    if (current.IsError()) {
      return Err(ParserError::NotParseable);
    }

    // The keyword runs from its key up to, but excluding, the separator in
    // front of the current token.
    size_t keywordLength = current.Index() - 1 - keyStart;
    if (!aKeywords.emplaceBack(keyStart, keywordLength)) {
      return Err(ParserError::OutOfMemory);
    }
  }

  if (current.IsNone()) {
    return Ok();
  }
  return Err(ParserError::NotParseable);
}
// Check whether |aExtension| is a well-formed `unicode_locale_extensions`
// subtag sequence, without recording any of its components.
//
// This function performs no allocation itself, so every failure is reported
// as |ParserError::NotParseable|, matching |ParseUnicodeExtension|.
Result<Ok, LocaleParser::ParserError> LocaleParser::CanParseUnicodeExtension(
    Span<const char> aExtension) {
  LocaleParser ts(aExtension);
  Token tok = ts.NextToken();

  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
  //                                       (sep attribute)+ (sep keyword)*) ;

  // The sequence has to open with the `u` singleton.
  if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') {
    return Err(ParserError::NotParseable);
  }

  // The singleton must be followed by at least one subtag.
  tok = ts.NextToken();
  if (tok.IsNone()) {
    return Err(ParserError::NotParseable);
  }

  // Skip the leading attributes.
  while (ts.IsUnicodeExtensionAttribute(tok)) {
    tok = ts.NextToken();
  }

  // keyword = key (sep type)? ;
  while (ts.IsUnicodeExtensionKey(tok)) {
    tok = ts.NextToken();

    while (ts.IsUnicodeExtensionType(tok)) {
      tok = ts.NextToken();
    }

    if (tok.IsError()) {
      return Err(ParserError::NotParseable);
    }
  }

  // Leftover input is a syntax problem, not an allocation failure: report it
  // as |NotParseable| so callers don't mistake malformed input for OOM.
  if (!tok.IsNone()) {
    return Err(ParserError::NotParseable);
  }

  return Ok();
}
// Check whether |aUnicodeType|, which must not be empty, consists solely of
// subtags accepted by |IsUnicodeExtensionType|. Anything else is reported as
// |NotParseable|.
Result<Ok, LocaleParser::ParserError>
LocaleParser::CanParseUnicodeExtensionType(Span<const char> aUnicodeType) {
  MOZ_ASSERT(!aUnicodeType.empty(), "caller must exclude empty strings");

  LocaleParser parser(aUnicodeType);

  // Walk over every leading type subtag; the loop body is intentionally empty.
  Token current = parser.NextToken();
  for (; parser.IsUnicodeExtensionType(current);
       current = parser.NextToken()) {
  }

  // The type subtags have to span the entire input.
  if (current.IsNone()) {
    return Ok();
  }
  return Err(ParserError::NotParseable);
}
} // namespace mozilla::intl