/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef intl_components_String_h_
#define intl_components_String_h_
#include "mozilla/Assertions.h"
#include "mozilla/Casting.h"
#include "mozilla/intl/ICU4CGlue.h"
#include "mozilla/intl/ICUError.h"
#include "mozilla/PodOperations.h"
#include "mozilla/Span.h"
#include "mozilla/Try.h"
#include "unicode/uchar.h"
#include "unicode/unorm2.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "unicode/utypes.h"
namespace mozilla::intl {
/**
* This component is a Mozilla-focused API for working with strings in
* internationalization code.
*/
class String final {
public:
String() = delete;
/**
* Return the locale-sensitive lower case string of the input.
*/
template <typename B>
static Result<Ok, ICUError> ToLocaleLowerCase(const char* aLocale,
Span<const char16_t> aString,
B& aBuffer) {
if (!aBuffer.reserve(aString.size())) {
return Err(ICUError::OutOfMemory);
}
return FillBufferWithICUCall(
aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
return u_strToLower(target, length, aString.data(), aString.size(),
aLocale, status);
});
}
/**
* Return the locale-sensitive upper case string of the input.
*/
template <typename B>
static Result<Ok, ICUError> ToLocaleUpperCase(const char* aLocale,
Span<const char16_t> aString,
B& aBuffer) {
if (!aBuffer.reserve(aString.size())) {
return Err(ICUError::OutOfMemory);
}
return FillBufferWithICUCall(
aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
return u_strToUpper(target, length, aString.data(), aString.size(),
aLocale, status);
});
}
/**
* Normalization form constants to describe which normalization algorithm
* should be performed.
*
* Also see:
* - Unicode Standard, §2.12 Equivalent Sequences
* - Unicode Standard, §3.11 Normalization Forms
*/
enum class NormalizationForm {
/**
* Normalization Form C
*/
NFC,
/**
* Normalization Form D
*/
NFD,
/**
* Normalization Form KC
*/
NFKC,
/**
* Normalization Form KD
*/
NFKD,
};
enum class AlreadyNormalized : bool { No, Yes };
/**
* Normalize the input string according to requested normalization form.
*
* Returns `AlreadyNormalized::Yes` when the string is already in normalized
* form. The output buffer is unchanged in this case. Otherwise returns
* `AlreadyNormalized::No` and places the normalized string into the output
* buffer.
*/
template <typename B>
static Result<AlreadyNormalized, ICUError> Normalize(
NormalizationForm aForm, Span<const char16_t> aString, B& aBuffer) {
// The unorm2_getXXXInstance() methods return a shared instance which must
// not be deleted.
UErrorCode status = U_ZERO_ERROR;
const UNormalizer2* normalizer;
switch (aForm) {
case NormalizationForm::NFC:
normalizer = unorm2_getNFCInstance(&status);
break;
case NormalizationForm::NFD:
normalizer = unorm2_getNFDInstance(&status);
break;
case NormalizationForm::NFKC:
normalizer = unorm2_getNFKCInstance(&status);
break;
case NormalizationForm::NFKD:
normalizer = unorm2_getNFKDInstance(&status);
break;
}
if (U_FAILURE(status)) {
return Err(ToICUError(status));
}
int32_t spanLengthInt = unorm2_spanQuickCheckYes(normalizer, aString.data(),
aString.size(), &status);
if (U_FAILURE(status)) {
return Err(ToICUError(status));
}
size_t spanLength = AssertedCast<size_t>(spanLengthInt);
MOZ_ASSERT(spanLength <= aString.size());
// Return if the input string is already normalized.
if (spanLength == aString.size()) {
return AlreadyNormalized::Yes;
}
if (!aBuffer.reserve(aString.size())) {
return Err(ICUError::OutOfMemory);
}
// Copy the already normalized prefix.
if (spanLength > 0) {
PodCopy(aBuffer.data(), aString.data(), spanLength);
aBuffer.written(spanLength);
}
MOZ_TRY(FillBufferWithICUCall(
aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
Span<const char16_t> remaining = aString.From(spanLength);
return unorm2_normalizeSecondAndAppend(normalizer, target, spanLength,
length, remaining.data(),
remaining.size(), status);
}));
return AlreadyNormalized::No;
}
/**
* Return true if the code point has the binary property "Cased".
*/
static bool IsCased(char32_t codePoint) {
return u_hasBinaryProperty(static_cast<UChar32>(codePoint), UCHAR_CASED);
}
/**
* Return true if the code point has the binary property "Case_Ignorable".
*/
static bool IsCaseIgnorable(char32_t codePoint) {
return u_hasBinaryProperty(static_cast<UChar32>(codePoint),
UCHAR_CASE_IGNORABLE);
}
/**
* Return the NFC pairwise composition of the two input characters, if any;
* returns 0 (which we know is not a composed char!) if none exists.
*/
static char32_t ComposePairNFC(char32_t a, char32_t b) {
// unorm2_getNFCInstance returns a static instance that does not have to be
// released here. If it fails, we just return 0 (no composition) always.
static UErrorCode status = U_ZERO_ERROR;
static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
if (U_FAILURE(status)) {
return 0;
}
UChar32 ch = unorm2_composePair(normalizer, static_cast<UChar32>(a),
static_cast<UChar32>(b));
return ch < 0 ? 0 : static_cast<char32_t>(ch);
}
/**
* Put the "raw" (single-level) canonical decomposition of the input char, if
* any, into the provided buffer. Canonical decomps are never more than two
* chars in length (although full normalization may result in longer output
* due to recursion).
* Returns the length of the decomposition (0 if none, else 1 or 2).
*/
static int DecomposeRawNFD(char32_t ab, char32_t decomp[2]) {
// unorm2_getNFCInstance returns a static instance that does not have to be
// released here. If it fails, we just return 0 (no decomposition) always.
// Although we are using it to query for a decomposition, the mode of the
// Normalizer2 is irrelevant here, so we may as well use the same singleton
// instance as ComposePairNFC.
static UErrorCode status = U_ZERO_ERROR;
static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
if (U_FAILURE(status)) {
return 0;
}
// Canonical decompositions are never more than two Unicode characters,
// or a maximum of 4 utf-16 code units.
const unsigned MAX_DECOMP_LENGTH = 4;
UErrorCode error = U_ZERO_ERROR;
UChar decompUtf16[MAX_DECOMP_LENGTH];
int32_t len =
unorm2_getRawDecomposition(normalizer, static_cast<UChar32>(ab),
decompUtf16, MAX_DECOMP_LENGTH, &error);
if (U_FAILURE(error) || len < 0) {
return 0;
}
UText text = UTEXT_INITIALIZER;
utext_openUChars(&text, decompUtf16, len, &error);
MOZ_ASSERT(U_SUCCESS(error));
UChar32 ch = UTEXT_NEXT32(&text);
len = 0;
if (ch != U_SENTINEL) {
decomp[0] = static_cast<char32_t>(ch);
++len;
ch = UTEXT_NEXT32(&text);
if (ch != U_SENTINEL) {
decomp[1] = static_cast<char32_t>(ch);
++len;
}
}
utext_close(&text);
return len;
}
/**
* Return the Unicode version, for example "13.0".
*/
static Span<const char> GetUnicodeVersion();
};
} // namespace mozilla::intl
#endif