416 lines
17 KiB
C++
416 lines
17 KiB
C++
|
// © 2019 and later: Unicode, Inc. and others.
|
||
|
// License & terms of use: http://www.unicode.org/copyright.html
|
||
|
|
||
|
// locdistance.cpp
|
||
|
// created: 2019may08 Markus W. Scherer
|
||
|
|
||
|
#include "unicode/utypes.h"
|
||
|
#include "unicode/bytestrie.h"
|
||
|
#include "unicode/localematcher.h"
|
||
|
#include "unicode/locid.h"
|
||
|
#include "unicode/uobject.h"
|
||
|
#include "unicode/ures.h"
|
||
|
#include "cstring.h"
|
||
|
#include "locdistance.h"
|
||
|
#include "loclikelysubtags.h"
|
||
|
#include "uassert.h"
|
||
|
#include "ucln_cmn.h"
|
||
|
#include "uinvchar.h"
|
||
|
#include "umutex.h"
|
||
|
|
||
|
U_NAMESPACE_BEGIN
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
/**
|
||
|
* Bit flag used on the last character of a subtag in the trie.
|
||
|
* Must be set consistently by the builder and the lookup code.
|
||
|
*/
|
||
|
constexpr int32_t END_OF_SUBTAG = 0x80;
|
||
|
/** Distance value bit flag, set by the builder. */
|
||
|
constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80;
|
||
|
/** Distance value bit flag, set by trieNext(). */
|
||
|
constexpr int32_t DISTANCE_IS_FINAL = 0x100;
|
||
|
constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT;
|
||
|
|
||
|
constexpr int32_t ABOVE_THRESHOLD = 100;
|
||
|
|
||
|
// Indexes into array of distances.
|
||
|
enum {
|
||
|
IX_DEF_LANG_DISTANCE,
|
||
|
IX_DEF_SCRIPT_DISTANCE,
|
||
|
IX_DEF_REGION_DISTANCE,
|
||
|
IX_MIN_REGION_DISTANCE,
|
||
|
IX_LIMIT
|
||
|
};
|
||
|
|
||
|
LocaleDistance *gLocaleDistance = nullptr;
|
||
|
UInitOnce gInitOnce {};
|
||
|
|
||
|
UBool U_CALLCONV cleanup() {
|
||
|
delete gLocaleDistance;
|
||
|
gLocaleDistance = nullptr;
|
||
|
gInitOnce.reset();
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
|
||
|
// This function is invoked only via umtx_initOnce().
|
||
|
U_ASSERT(gLocaleDistance == nullptr);
|
||
|
const LikelySubtags &likely = *LikelySubtags::getSingleton(errorCode);
|
||
|
if (U_FAILURE(errorCode)) { return; }
|
||
|
const LocaleDistanceData &data = likely.getDistanceData();
|
||
|
if (data.distanceTrieBytes == nullptr ||
|
||
|
data.regionToPartitions == nullptr || data.partitions == nullptr ||
|
||
|
// ok if no paradigms
|
||
|
data.distances == nullptr) {
|
||
|
errorCode = U_MISSING_RESOURCE_ERROR;
|
||
|
return;
|
||
|
}
|
||
|
gLocaleDistance = new LocaleDistance(data, likely);
|
||
|
if (gLocaleDistance == nullptr) {
|
||
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||
|
return;
|
||
|
}
|
||
|
ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);
|
||
|
}
|
||
|
|
||
|
const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
|
||
|
if (U_FAILURE(errorCode)) { return nullptr; }
|
||
|
umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);
|
||
|
return gLocaleDistance;
|
||
|
}
|
||
|
|
||
|
LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const LikelySubtags &likely) :
|
||
|
likelySubtags(likely),
|
||
|
trie(data.distanceTrieBytes),
|
||
|
regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
|
||
|
paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
|
||
|
defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]),
|
||
|
defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]),
|
||
|
defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]),
|
||
|
minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) {
|
||
|
// For the default demotion value, use the
|
||
|
// default region distance between unrelated Englishes.
|
||
|
// Thus, unless demotion is turned off,
|
||
|
// a mere region difference for one desired locale
|
||
|
// is as good as a perfect match for the next following desired locale.
|
||
|
// As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
|
||
|
LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR);
|
||
|
LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR);
|
||
|
const LSR *p_enGB = &enGB;
|
||
|
int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1,
|
||
|
shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY);
|
||
|
defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance);
|
||
|
}
|
||
|
|
||
|
int32_t LocaleDistance::getBestIndexAndDistance(
|
||
|
const LSR &desired,
|
||
|
const LSR **supportedLSRs, int32_t supportedLSRsLength,
|
||
|
int32_t shiftedThreshold,
|
||
|
ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const {
|
||
|
BytesTrie iter(trie);
|
||
|
// Look up the desired language only once for all supported LSRs.
|
||
|
// Its "distance" is either a match point value of 0, or a non-match negative value.
|
||
|
// Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
|
||
|
int32_t desLangDistance = trieNext(iter, desired.language, false);
|
||
|
uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
|
||
|
// Index of the supported LSR with the lowest distance.
|
||
|
int32_t bestIndex = -1;
|
||
|
// Cached lookup info from LikelySubtags.compareLikely().
|
||
|
int32_t bestLikelyInfo = -1;
|
||
|
for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
|
||
|
const LSR &supported = *supportedLSRs[slIndex];
|
||
|
bool star = false;
|
||
|
int32_t distance = desLangDistance;
|
||
|
if (distance >= 0) {
|
||
|
U_ASSERT((distance & DISTANCE_IS_FINAL) == 0);
|
||
|
if (slIndex != 0) {
|
||
|
iter.resetToState64(desLangState);
|
||
|
}
|
||
|
distance = trieNext(iter, supported.language, true);
|
||
|
}
|
||
|
// Note: The data builder verifies that there are no rules with "any" (*) language and
|
||
|
// real (non *) script or region subtags.
|
||
|
// This means that if the lookup for either language fails we can use
|
||
|
// the default distances without further lookups.
|
||
|
int32_t flags;
|
||
|
if (distance >= 0) {
|
||
|
flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
|
||
|
distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
|
||
|
} else { // <*, *>
|
||
|
if (uprv_strcmp(desired.language, supported.language) == 0) {
|
||
|
distance = 0;
|
||
|
} else {
|
||
|
distance = defaultLanguageDistance;
|
||
|
}
|
||
|
flags = 0;
|
||
|
star = true;
|
||
|
}
|
||
|
U_ASSERT(0 <= distance && distance <= 100);
|
||
|
// Round up the shifted threshold (if fraction bits are not 0)
|
||
|
// for comparison with un-shifted distances until we need fraction bits.
|
||
|
// (If we simply shifted non-zero fraction bits away, then we might ignore a language
|
||
|
// when it's really still a micro distance below the threshold.)
|
||
|
int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
|
||
|
// We implement "favor subtag" by reducing the language subtag distance
|
||
|
// (unscientifically reducing it to a quarter of the normal value),
|
||
|
// so that the script distance is relatively more important.
|
||
|
// For example, given a default language distance of 80, we reduce it to 20,
|
||
|
// which is below the default threshold of 50, which is the default script distance.
|
||
|
if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
|
||
|
distance >>= 2;
|
||
|
}
|
||
|
// Let distance == roundedThreshold pass until the tie-breaker logic
|
||
|
// at the end of the loop.
|
||
|
if (distance > roundedThreshold) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
int32_t scriptDistance;
|
||
|
if (star || flags != 0) {
|
||
|
if (uprv_strcmp(desired.script, supported.script) == 0) {
|
||
|
scriptDistance = 0;
|
||
|
} else {
|
||
|
scriptDistance = defaultScriptDistance;
|
||
|
}
|
||
|
} else {
|
||
|
scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(),
|
||
|
desired.script, supported.script);
|
||
|
flags = scriptDistance & DISTANCE_IS_FINAL;
|
||
|
scriptDistance &= ~DISTANCE_IS_FINAL;
|
||
|
}
|
||
|
distance += scriptDistance;
|
||
|
if (distance > roundedThreshold) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
if (uprv_strcmp(desired.region, supported.region) == 0) {
|
||
|
// regionDistance = 0
|
||
|
} else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
|
||
|
distance += defaultRegionDistance;
|
||
|
} else {
|
||
|
int32_t remainingThreshold = roundedThreshold - distance;
|
||
|
if (minRegionDistance > remainingThreshold) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// From here on we know the regions are not equal.
|
||
|
// Map each region to zero or more partitions. (zero = one non-matching string)
|
||
|
// (Each array of single-character partition strings is encoded as one string.)
|
||
|
// If either side has more than one, then we find the maximum distance.
|
||
|
// This could be optimized by adding some more structure, but probably not worth it.
|
||
|
distance += getRegionPartitionsDistance(
|
||
|
iter, iter.getState64(),
|
||
|
partitionsForRegion(desired),
|
||
|
partitionsForRegion(supported),
|
||
|
remainingThreshold);
|
||
|
}
|
||
|
int32_t shiftedDistance = shiftDistance(distance);
|
||
|
if (shiftedDistance == 0) {
|
||
|
// Distinguish between equivalent but originally unequal locales via an
|
||
|
// additional micro distance.
|
||
|
shiftedDistance |= (desired.flags ^ supported.flags);
|
||
|
if (shiftedDistance < shiftedThreshold) {
|
||
|
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
|
||
|
// Is there also a match when we swap desired/supported?
|
||
|
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
|
||
|
if (shiftedDistance == 0) {
|
||
|
return slIndex << INDEX_SHIFT;
|
||
|
}
|
||
|
bestIndex = slIndex;
|
||
|
shiftedThreshold = shiftedDistance;
|
||
|
bestLikelyInfo = -1;
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
if (shiftedDistance < shiftedThreshold) {
|
||
|
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
|
||
|
// Is there also a match when we swap desired/supported?
|
||
|
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
|
||
|
bestIndex = slIndex;
|
||
|
shiftedThreshold = shiftedDistance;
|
||
|
bestLikelyInfo = -1;
|
||
|
}
|
||
|
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
|
||
|
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
|
||
|
// Is there also a match when we swap desired/supported?
|
||
|
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
|
||
|
bestLikelyInfo = likelySubtags.compareLikely(
|
||
|
supported, *supportedLSRs[bestIndex], bestLikelyInfo);
|
||
|
if ((bestLikelyInfo & 1) != 0) {
|
||
|
// This supported locale matches as well as the previous best match,
|
||
|
// and neither matches perfectly,
|
||
|
// but this one is "more likely" (has more-default subtags).
|
||
|
bestIndex = slIndex;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return bestIndex >= 0 ?
|
||
|
(bestIndex << INDEX_SHIFT) | shiftedThreshold :
|
||
|
INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
|
||
|
}
|
||
|
|
||
|
int32_t LocaleDistance::getDesSuppScriptDistance(
|
||
|
BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) {
|
||
|
// Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
|
||
|
int32_t distance = trieNext(iter, desired, false);
|
||
|
if (distance >= 0) {
|
||
|
distance = trieNext(iter, supported, true);
|
||
|
}
|
||
|
if (distance < 0) {
|
||
|
UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *>
|
||
|
U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
|
||
|
if (uprv_strcmp(desired, supported) == 0) {
|
||
|
distance = 0; // same script
|
||
|
} else {
|
||
|
distance = iter.getValue();
|
||
|
U_ASSERT(distance >= 0);
|
||
|
}
|
||
|
if (result == USTRINGTRIE_FINAL_VALUE) {
|
||
|
distance |= DISTANCE_IS_FINAL;
|
||
|
}
|
||
|
}
|
||
|
return distance;
|
||
|
}
|
||
|
|
||
|
int32_t LocaleDistance::getRegionPartitionsDistance(
|
||
|
BytesTrie &iter, uint64_t startState,
|
||
|
const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) {
|
||
|
char desired = *desiredPartitions++;
|
||
|
char supported = *supportedPartitions++;
|
||
|
U_ASSERT(desired != 0 && supported != 0);
|
||
|
// See if we have single desired/supported partitions, from NUL-terminated
|
||
|
// partition strings without explicit length.
|
||
|
bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character
|
||
|
// equivalent to: if (desLength == 1 && suppLength == 1)
|
||
|
if (*desiredPartitions == 0 && !suppLengthGt1) {
|
||
|
// Fastpath for single desired/supported partitions.
|
||
|
UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
|
||
|
if (USTRINGTRIE_HAS_NEXT(result)) {
|
||
|
result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
|
||
|
if (USTRINGTRIE_HAS_VALUE(result)) {
|
||
|
return iter.getValue();
|
||
|
}
|
||
|
}
|
||
|
return getFallbackRegionDistance(iter, startState);
|
||
|
}
|
||
|
|
||
|
const char *supportedStart = supportedPartitions - 1; // for restart of inner loop
|
||
|
int32_t regionDistance = 0;
|
||
|
// Fall back to * only once, not for each pair of partition strings.
|
||
|
bool star = false;
|
||
|
for (;;) {
|
||
|
// Look up each desired-partition string only once,
|
||
|
// not for each (desired, supported) pair.
|
||
|
UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
|
||
|
if (USTRINGTRIE_HAS_NEXT(result)) {
|
||
|
uint64_t desState = suppLengthGt1 ? iter.getState64() : 0;
|
||
|
for (;;) {
|
||
|
result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
|
||
|
int32_t d;
|
||
|
if (USTRINGTRIE_HAS_VALUE(result)) {
|
||
|
d = iter.getValue();
|
||
|
} else if (star) {
|
||
|
d = 0;
|
||
|
} else {
|
||
|
d = getFallbackRegionDistance(iter, startState);
|
||
|
star = true;
|
||
|
}
|
||
|
if (d > threshold) {
|
||
|
return d;
|
||
|
} else if (regionDistance < d) {
|
||
|
regionDistance = d;
|
||
|
}
|
||
|
if ((supported = *supportedPartitions++) != 0) {
|
||
|
iter.resetToState64(desState);
|
||
|
} else {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
} else if (!star) {
|
||
|
int32_t d = getFallbackRegionDistance(iter, startState);
|
||
|
if (d > threshold) {
|
||
|
return d;
|
||
|
} else if (regionDistance < d) {
|
||
|
regionDistance = d;
|
||
|
}
|
||
|
star = true;
|
||
|
}
|
||
|
if ((desired = *desiredPartitions++) != 0) {
|
||
|
iter.resetToState64(startState);
|
||
|
supportedPartitions = supportedStart;
|
||
|
supported = *supportedPartitions++;
|
||
|
} else {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
return regionDistance;
|
||
|
}
|
||
|
|
||
|
int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {
|
||
|
#if U_DEBUG
|
||
|
UStringTrieResult result =
|
||
|
#endif
|
||
|
iter.resetToState64(startState).next(u'*'); // <*, *>
|
||
|
U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
|
||
|
int32_t distance = iter.getValue();
|
||
|
U_ASSERT(distance >= 0);
|
||
|
return distance;
|
||
|
}
|
||
|
|
||
|
int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {
|
||
|
uint8_t c;
|
||
|
if ((c = *s) == 0) {
|
||
|
return -1; // no empty subtags in the distance data
|
||
|
}
|
||
|
for (;;) {
|
||
|
c = uprv_invCharToAscii(c);
|
||
|
// EBCDIC: If *s is not an invariant character,
|
||
|
// then c is now 0 and will simply not match anything, which is harmless.
|
||
|
uint8_t next = *++s;
|
||
|
if (next != 0) {
|
||
|
if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
|
||
|
return -1;
|
||
|
}
|
||
|
} else {
|
||
|
// last character of this subtag
|
||
|
UStringTrieResult result = iter.next(c | END_OF_SUBTAG);
|
||
|
if (wantValue) {
|
||
|
if (USTRINGTRIE_HAS_VALUE(result)) {
|
||
|
int32_t value = iter.getValue();
|
||
|
if (result == USTRINGTRIE_FINAL_VALUE) {
|
||
|
value |= DISTANCE_IS_FINAL;
|
||
|
}
|
||
|
return value;
|
||
|
}
|
||
|
} else {
|
||
|
if (USTRINGTRIE_HAS_NEXT(result)) {
|
||
|
return 0;
|
||
|
}
|
||
|
}
|
||
|
return -1;
|
||
|
}
|
||
|
c = next;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
bool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
|
||
|
// Linear search for a very short list (length 6 as of 2019),
|
||
|
// because we look for equivalence not equality, and
|
||
|
// because it's easy.
|
||
|
// If there are many paradigm LSRs we should use a hash set
|
||
|
// with custom comparator and hasher.
|
||
|
U_ASSERT(paradigmLSRsLength <= 15);
|
||
|
for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
|
||
|
if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; }
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
U_NAMESPACE_END
|