428 lines
13 KiB
C++
428 lines
13 KiB
C++
|
// © 2018 and later: Unicode, Inc. and others.
|
||
|
// License & terms of use: http://www.unicode.org/copyright.html
|
||
|
|
||
|
// characterproperties.cpp
|
||
|
// created: 2018sep03 Markus W. Scherer
|
||
|
|
||
|
#include "unicode/utypes.h"
|
||
|
#include "unicode/localpointer.h"
|
||
|
#include "unicode/uchar.h"
|
||
|
#include "unicode/ucpmap.h"
|
||
|
#include "unicode/ucptrie.h"
|
||
|
#include "unicode/umutablecptrie.h"
|
||
|
#include "unicode/uniset.h"
|
||
|
#include "unicode/uscript.h"
|
||
|
#include "unicode/uset.h"
|
||
|
#include "cmemory.h"
|
||
|
#include "emojiprops.h"
|
||
|
#include "mutex.h"
|
||
|
#include "normalizer2impl.h"
|
||
|
#include "uassert.h"
|
||
|
#include "ubidi_props.h"
|
||
|
#include "ucase.h"
|
||
|
#include "ucln_cmn.h"
|
||
|
#include "umutex.h"
|
||
|
#include "uprops.h"
|
||
|
|
||
|
using icu::LocalPointer;
|
||
|
#if !UCONFIG_NO_NORMALIZATION
|
||
|
using icu::Normalizer2Factory;
|
||
|
using icu::Normalizer2Impl;
|
||
|
#endif
|
||
|
using icu::UInitOnce;
|
||
|
using icu::UnicodeSet;
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
UBool U_CALLCONV characterproperties_cleanup();
|
||
|
|
||
|
constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
|
||
|
|
||
|
struct Inclusion {
|
||
|
UnicodeSet *fSet = nullptr;
|
||
|
UInitOnce fInitOnce {};
|
||
|
};
|
||
|
Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
|
||
|
|
||
|
UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
|
||
|
|
||
|
UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
|
||
|
|
||
|
icu::UMutex cpMutex;
|
||
|
|
||
|
//----------------------------------------------------------------
|
||
|
// Inclusions list
|
||
|
//----------------------------------------------------------------
|
||
|
|
||
|
// USetAdder implementation
|
||
|
// Does not use uset.h to reduce code dependencies
|
||
|
void U_CALLCONV
|
||
|
_set_add(USet *set, UChar32 c) {
|
||
|
((UnicodeSet *)set)->add(c);
|
||
|
}
|
||
|
|
||
|
void U_CALLCONV
|
||
|
_set_addRange(USet *set, UChar32 start, UChar32 end) {
|
||
|
((UnicodeSet *)set)->add(start, end);
|
||
|
}
|
||
|
|
||
|
void U_CALLCONV
|
||
|
_set_addString(USet *set, const char16_t *str, int32_t length) {
|
||
|
((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
|
||
|
}
|
||
|
|
||
|
// Library cleanup hook registered via ucln_common_registerCleanup().
// Releases every cached inclusions set, binary-property set and int-property
// map, and resets the init-once guards so the caches can be rebuilt later.
// Always returns true.
UBool U_CALLCONV characterproperties_cleanup() {
    for (Inclusion &slot : gInclusions) {
        delete slot.fSet;
        slot.fSet = nullptr;
        slot.fInitOnce.reset();
    }
    for (UnicodeSet *&cachedSet : sets) {
        delete cachedSet;
        cachedSet = nullptr;
    }
    for (UCPMap *&cachedMap : maps) {
        // The maps are created by makeMap() as UCPTrie objects cast to UCPMap.
        ucptrie_close(reinterpret_cast<UCPTrie *>(cachedMap));
        cachedMap = nullptr;
    }
    return true;
}
|
||
|
|
||
|
// Builds the inclusions set for one property data source and stores it in
// gInclusions[src].
//
// src        data source to collect property starts for; UPROPS_SRC_NONE
//            and unknown sources yield U_INTERNAL_PROGRAM_ERROR
// errorCode  in/out ICU error code; set to U_MEMORY_ALLOCATION_ERROR when
//            the set cannot be allocated or ends up bogus
//
// On any failure gInclusions[src].fSet is left untouched.
void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
    // Reached only through umtx_initOnce(), see getInclusionsForSource().
    U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
    if (src == UPROPS_SRC_NONE) {
        errorCode = U_INTERNAL_PROGRAM_ERROR;
        return;
    }
    U_ASSERT(gInclusions[src].fSet == nullptr);

    LocalPointer<UnicodeSet> starts(new UnicodeSet());
    if (starts.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    USetAdder sa = {
        starts->toUSet(),
        _set_add,
        _set_addRange,
        _set_addString,
        nullptr,  // remove() is not needed
        nullptr   // removeRange() is not needed
    };

#if !UCONFIG_NO_NORMALIZATION
    // Shared by the normalization-backed sources. The impl pointer is only
    // dereferenced if the factory call that produced it left errorCode
    // in a success state.
    const auto addNormStarts = [&sa, &errorCode](const Normalizer2Impl *impl) {
        if (U_SUCCESS(errorCode)) {
            impl->addPropertyStarts(&sa, errorCode);
        }
    };
#endif

    switch (src) {
    case UPROPS_SRC_CHAR:
        uchar_addPropertyStarts(&sa, &errorCode);
        break;
    case UPROPS_SRC_PROPSVEC:
        upropsvec_addPropertyStarts(&sa, &errorCode);
        break;
    case UPROPS_SRC_CHAR_AND_PROPSVEC:
        uchar_addPropertyStarts(&sa, &errorCode);
        upropsvec_addPropertyStarts(&sa, &errorCode);
        break;
    case UPROPS_SRC_CASE:
        ucase_addPropertyStarts(&sa, &errorCode);
        break;
    case UPROPS_SRC_BIDI:
        ubidi_addPropertyStarts(&sa, &errorCode);
        break;
    case UPROPS_SRC_INPC:
    case UPROPS_SRC_INSC:
    case UPROPS_SRC_VO:
    case UPROPS_SRC_ID_COMPAT_MATH:
        uprops_addPropertyStarts(src, &sa, &errorCode);
        break;
    case UPROPS_SRC_EMOJI: {
        const icu::EmojiProps *emoji = icu::EmojiProps::getSingleton(errorCode);
        if (U_SUCCESS(errorCode)) {
            emoji->addPropertyStarts(&sa, errorCode);
        }
        break;
    }
    case UPROPS_SRC_IDSU:
        // New in Unicode 15.1 for just two characters.
        sa.add(sa.set, 0x2FFE);
        sa.add(sa.set, 0x2FFF + 1);
        break;
#if !UCONFIG_NO_NORMALIZATION
    case UPROPS_SRC_CASE_AND_NORM:
        addNormStarts(Normalizer2Factory::getNFCImpl(errorCode));
        ucase_addPropertyStarts(&sa, &errorCode);
        break;
    case UPROPS_SRC_NFC:
        addNormStarts(Normalizer2Factory::getNFCImpl(errorCode));
        break;
    case UPROPS_SRC_NFKC:
        addNormStarts(Normalizer2Factory::getNFKCImpl(errorCode));
        break;
    case UPROPS_SRC_NFKC_CF:
        addNormStarts(Normalizer2Factory::getNFKC_CFImpl(errorCode));
        break;
    case UPROPS_SRC_NFC_CANON_ITER: {
        const Normalizer2Impl *nfc = Normalizer2Factory::getNFCImpl(errorCode);
        if (U_SUCCESS(errorCode)) {
            nfc->addCanonIterPropertyStarts(&sa, errorCode);
        }
        break;
    }
#endif
    default:
        errorCode = U_INTERNAL_PROGRAM_ERROR;
        break;
    }

    if (U_FAILURE(errorCode)) {
        return;
    }
    if (starts->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Shrink the set before it goes into the long-lived cache.
    starts->compact();
    gInclusions[src].fSet = starts.orphan();
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
|
||
|
|
||
|
// Returns the cached inclusions set for a property data source, building it
// on first use through umtx_initOnce() and initInclusion().
//
// Returns nullptr if errorCode already indicates failure on entry, or, with
// U_ILLEGAL_ARGUMENT_ERROR, if src is outside [0, UPROPS_SRC_COUNT).
// Otherwise returns whatever is stored in the cache slot.
const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool validSource = 0 <= src && src < UPROPS_SRC_COUNT;
    if (!validSource) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Inclusion &slot = gInclusions[src];
    umtx_initOnce(slot.fInitOnce, &initInclusion, src, errorCode);
    return slot.fSet;
}
|
||
|
|
||
|
// Builds the inclusions set for one integer-valued property and stores it in
// gInclusions[UPROPS_SRC_COUNT + (prop - UCHAR_INT_START)].
//
// The set starts out containing code point 0. Then every code point in the
// inclusions set of the property's data source is visited in order, and each
// one whose property value differs from the previously seen value is added.
//
// prop       integer property in [UCHAR_INT_START, UCHAR_INT_LIMIT)
// errorCode  in/out ICU error code; set to U_MEMORY_ALLOCATION_ERROR when
//            the set cannot be allocated or ends up bogus
void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
    // Reached only through umtx_initOnce().
    U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
    const int32_t slotIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
    U_ASSERT(gInclusions[slotIndex].fSet == nullptr);

    const UnicodeSet *sourceStarts = getInclusionsForSource(uprops_getSource(prop), errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }

    LocalPointer<UnicodeSet> propStarts(new UnicodeSet(0, 0));
    if (propStarts.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    const int32_t rangeCount = sourceStarts->getRangeCount();
    int32_t lastValue = 0;
    for (int32_t r = 0; r < rangeCount; ++r) {
        const UChar32 first = sourceStarts->getRangeStart(r);
        const UChar32 last = sourceStarts->getRangeEnd(r);
        for (UChar32 c = first; c <= last; ++c) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            const int32_t current = u_getIntPropertyValue(c, prop);
            if (current == lastValue) {
                continue;
            }
            propStarts->add(c);
            lastValue = current;
        }
    }

    if (propStarts->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Shrink the set before it goes into the long-lived cache.
    propStarts->compact();
    gInclusions[slotIndex].fSet = propStarts.orphan();
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
U_NAMESPACE_BEGIN
|
||
|
|
||
|
// Returns the cached inclusions set for a property.
//
// Integer-valued properties (UCHAR_INT_START <= prop < UCHAR_INT_LIMIT) have
// their own per-property slot, built on first use by initIntPropInclusion().
// All other properties share the set of their data source as reported by
// uprops_getSource().
//
// Returns nullptr if errorCode already indicates failure on entry.
const UnicodeSet *CharacterProperties::getInclusionsForProperty(
        UProperty prop, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool isIntProperty = UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT;
    if (!isIntProperty) {
        return getInclusionsForSource(uprops_getSource(prop), errorCode);
    }
    Inclusion &slot = gInclusions[UPROPS_SRC_COUNT + (prop - UCHAR_INT_START)];
    umtx_initOnce(slot.fInitOnce, &initIntPropInclusion, prop, errorCode);
    return slot.fSet;
}
|
||
|
|
||
|
U_NAMESPACE_END
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
// Builds a new, frozen UnicodeSet holding everything that has the given
// binary property.
//
// For properties in [UCHAR_BASIC_EMOJI, UCHAR_RGI_EMOJI] the strings are
// added via EmojiProps::addStrings(). All properties in that range other
// than UCHAR_BASIC_EMOJI and UCHAR_RGI_EMOJI are treated as strings-only and
// return right after that step. For everything else the code points of the
// property's inclusions set are scanned with u_hasBinaryProperty() and runs
// of "true" are added as ranges.
//
// property   binary property (or property of strings) to build the set for
// errorCode  in/out ICU error code
//
// Returns a heap-allocated set owned by the caller, or nullptr on failure.
// Unlike a plain pass-through, a failure reported by addStrings() or a bogus
// (out-of-memory) set is turned into a nullptr return so that callers never
// cache a partially built set.
UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    LocalPointer<UnicodeSet> set(new UnicodeSet());
    if (set.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
        // property of strings
        const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
        if (U_FAILURE(errorCode)) { return nullptr; }
        USetAdder sa = {
            (USet *)set.getAlias(),
            _set_add,
            _set_addRange,
            _set_addString,
            nullptr, // don't need remove()
            nullptr // don't need removeRange()
        };
        ep->addStrings(&sa, property, errorCode);
        // Do not hand out (and let the caller cache) a partially filled set.
        if (U_FAILURE(errorCode)) { return nullptr; }
        if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
            // property of _only_ strings
            if (set->isBogus()) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return nullptr;
            }
            set->freeze();
            return set.orphan();
        }
    }

    const UnicodeSet *inclusions =
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    if (U_FAILURE(errorCode)) { return nullptr; }
    int32_t numRanges = inclusions->getRangeCount();
    // Start of the current run of code points that have the property,
    // or -1 while outside such a run.
    UChar32 startHasProperty = -1;

    for (int32_t i = 0; i < numRanges; ++i) {
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
            if (u_hasBinaryProperty(c, property)) {
                if (startHasProperty < 0) {
                    // Transition from false to true.
                    startHasProperty = c;
                }
            } else if (startHasProperty >= 0) {
                // Transition from true to false.
                set->add(startHasProperty, c - 1);
                startHasProperty = -1;
            }
        }
    }
    if (startHasProperty >= 0) {
        // The last run extends to the end of the code point space.
        set->add(startHasProperty, 0x10FFFF);
    }
    // Same out-of-memory handling as the inclusions builders in this file.
    if (set->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    set->freeze();
    return set.orphan();
}
|
||
|
|
||
|
// Builds an immutable code point map (a UCPTrie returned as UCPMap) holding
// the values of an integer-valued property.
//
// The mutable trie is opened with nullValue as both its initial and its
// error value (USCRIPT_UNKNOWN for UCHAR_SCRIPT, 0 otherwise). The code
// points of the property's inclusions set are visited in order; whenever the
// property value changes, the finished run [start, c - 1] is written unless
// its value equals nullValue, which the trie already holds.
//
// property   integer property to build the map for
// errorCode  in/out ICU error code
//
// Returns the result of umutablecptrie_buildImmutable(), or nullptr if
// errorCode indicates failure before the build step.
UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
    icu::LocalUMutableCPTriePointer mutableTrie(
        umutablecptrie_open(nullValue, nullValue, &errorCode));
    const UnicodeSet *inclusions =
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    // Also covers a failed umutablecptrie_open().
    if (U_FAILURE(errorCode)) { return nullptr; }
    int32_t numRanges = inclusions->getRangeCount();
    UChar32 start = 0;
    uint32_t value = nullValue;

    for (int32_t i = 0; i < numRanges; ++i) {
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            uint32_t nextValue = u_getIntPropertyValue(c, property);
            if (value != nextValue) {
                if (value != nullValue) {
                    umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
                }
                start = c;
                value = nextValue;
            }
        }
    }
    // Flush the last run. Compare against nullValue (not 0), matching the
    // test inside the loop: when nullValue is non-zero, a trailing run whose
    // value is 0 must still be written, and a trailing run equal to
    // nullValue is already what the trie contains.
    if (value != nullValue) {
        umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
    }

    // Bidi class and general category get the fast trie type; every other
    // property gets the small one.
    UCPTrieType type;
    if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
        type = UCPTRIE_TYPE_FAST;
    } else {
        type = UCPTRIE_TYPE_SMALL;
    }
    // Pick the narrowest value width that can hold the property's maximum value.
    UCPTrieValueWidth valueWidth;
    // TODO: UCharacterProperty.IntProperty
    int32_t max = u_getIntPropertyMaxValue(property);
    if (max <= 0xff) {
        valueWidth = UCPTRIE_VALUE_BITS_8;
    } else if (max <= 0xffff) {
        valueWidth = UCPTRIE_VALUE_BITS_16;
    } else {
        valueWidth = UCPTRIE_VALUE_BITS_32;
    }
    return reinterpret_cast<UCPMap *>(
        umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
}
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
U_NAMESPACE_BEGIN
|
||
|
|
||
|
// Returns the cached set for a binary property, creating it with makeSet()
// on first use. The cache slot is read and filled while holding cpMutex.
//
// Returns nullptr if errorCode already indicates failure on entry, or, with
// U_ILLEGAL_ARGUMENT_ERROR, if property is outside [0, UCHAR_BINARY_LIMIT).
const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool isBinaryProperty = 0 <= property && property < UCHAR_BINARY_LIMIT;
    if (!isBinaryProperty) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Mutex lock(&cpMutex);
    UnicodeSet *&slot = sets[property];
    if (slot == nullptr) {
        slot = makeSet(property, errorCode);
    }
    return slot;
}
|
||
|
|
||
|
U_NAMESPACE_END
|
||
|
|
||
|
U_NAMESPACE_USE
|
||
|
|
||
|
// C API wrapper around CharacterProperties::getBinaryPropertySet().
// Returns the set as a const USet *, or nullptr whenever *pErrorCode
// indicates failure after the lookup.
U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
    const UnicodeSet *propertySet =
        CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    return propertySet->toUSet();
}
|
||
|
|
||
|
// Returns the cached code point map for an integer-valued property, creating
// it with makeMap() on first use. The cache slot is read and filled while
// holding cpMutex.
//
// Returns nullptr if *pErrorCode already indicates failure on entry, or,
// with U_ILLEGAL_ARGUMENT_ERROR, if property is outside
// [UCHAR_INT_START, UCHAR_INT_LIMIT).
U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    const bool isIntProperty = UCHAR_INT_START <= property && property < UCHAR_INT_LIMIT;
    if (!isIntProperty) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Mutex lock(&cpMutex);
    UCPMap *&slot = maps[property - UCHAR_INT_START];
    if (slot == nullptr) {
        slot = makeMap(property, *pErrorCode);
    }
    return slot;
}
|