1523 lines
60 KiB
C++
1523 lines
60 KiB
C++
|
// © 2016 and later: Unicode, Inc. and others.
|
||
|
// License & terms of use: http://www.unicode.org/copyright.html
|
||
|
/*
|
||
|
******************************************************************************
|
||
|
*
|
||
|
* Copyright (C) 2007-2012, International Business Machines
|
||
|
* Corporation and others. All Rights Reserved.
|
||
|
*
|
||
|
******************************************************************************
|
||
|
* file name: unisetspan.cpp
|
||
|
* encoding: UTF-8
|
||
|
* tab size: 8 (not used)
|
||
|
* indentation:4
|
||
|
*
|
||
|
* created on: 2007mar01
|
||
|
* created by: Markus W. Scherer
|
||
|
*/
|
||
|
|
||
|
#include "unicode/utypes.h"
|
||
|
#include "unicode/uniset.h"
|
||
|
#include "unicode/ustring.h"
|
||
|
#include "unicode/utf8.h"
|
||
|
#include "unicode/utf16.h"
|
||
|
#include "cmemory.h"
|
||
|
#include "uvector.h"
|
||
|
#include "unisetspan.h"
|
||
|
|
||
|
U_NAMESPACE_BEGIN
|
||
|
|
||
|
/*
|
||
|
* List of offsets from the current position from where to try matching
|
||
|
* a code point or a string.
|
||
|
* Store offsets rather than indexes to simplify the code and use the same list
|
||
|
* for both increments (in span()) and decrements (in spanBack()).
|
||
|
*
|
||
|
* Assumption: The maximum offset is limited, and the offsets that are stored
|
||
|
* at any one time are relatively dense, that is, there are normally no gaps of
|
||
|
* hundreds or thousands of offset values.
|
||
|
*
|
||
|
* The implementation uses a circular buffer of byte flags,
|
||
|
* each indicating whether the corresponding offset is in the list.
|
||
|
* This avoids inserting into a sorted list of offsets (or absolute indexes) and
|
||
|
* physically moving part of the list.
|
||
|
*
|
||
|
* Note: In principle, the caller should setMaxLength() to the maximum of the
|
||
|
* max string length and U16_LENGTH/U8_LENGTH to account for
|
||
|
* "long" single code points.
|
||
|
* However, this implementation uses at least a staticList with more than
|
||
|
* U8_LENGTH entries anyway.
|
||
|
*
|
||
|
* Note: If maxLength were guaranteed to be no more than 32 or 64,
|
||
|
* the list could be stored as bit flags in a single integer.
|
||
|
* Rather than handling a circular buffer with a start list index,
|
||
|
* the integer would simply be shifted when lower offsets are removed.
|
||
|
* UnicodeSet does not have a limit on the lengths of strings.
|
||
|
*/
|
||
|
class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
|
||
|
public:
|
||
|
OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
|
||
|
|
||
|
~OffsetList() {
|
||
|
if(list!=staticList) {
|
||
|
uprv_free(list);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Call exactly once if the list is to be used.
|
||
|
void setMaxLength(int32_t maxLength) {
|
||
|
if(maxLength<=(int32_t)sizeof(staticList)) {
|
||
|
capacity=(int32_t)sizeof(staticList);
|
||
|
} else {
|
||
|
UBool *l=(UBool *)uprv_malloc(maxLength);
|
||
|
if(l!=nullptr) {
|
||
|
list=l;
|
||
|
capacity=maxLength;
|
||
|
}
|
||
|
}
|
||
|
uprv_memset(list, 0, capacity);
|
||
|
}
|
||
|
|
||
|
void clear() {
|
||
|
uprv_memset(list, 0, capacity);
|
||
|
start=length=0;
|
||
|
}
|
||
|
|
||
|
UBool isEmpty() const {
|
||
|
return (UBool)(length==0);
|
||
|
}
|
||
|
|
||
|
// Reduce all stored offsets by delta, used when the current position
|
||
|
// moves by delta.
|
||
|
// There must not be any offsets lower than delta.
|
||
|
// If there is an offset equal to delta, it is removed.
|
||
|
// delta=[1..maxLength]
|
||
|
void shift(int32_t delta) {
|
||
|
int32_t i=start+delta;
|
||
|
if(i>=capacity) {
|
||
|
i-=capacity;
|
||
|
}
|
||
|
if(list[i]) {
|
||
|
list[i]=false;
|
||
|
--length;
|
||
|
}
|
||
|
start=i;
|
||
|
}
|
||
|
|
||
|
// Add an offset. The list must not contain it yet.
|
||
|
// offset=[1..maxLength]
|
||
|
void addOffset(int32_t offset) {
|
||
|
int32_t i=start+offset;
|
||
|
if(i>=capacity) {
|
||
|
i-=capacity;
|
||
|
}
|
||
|
list[i]=true;
|
||
|
++length;
|
||
|
}
|
||
|
|
||
|
// offset=[1..maxLength]
|
||
|
UBool containsOffset(int32_t offset) const {
|
||
|
int32_t i=start+offset;
|
||
|
if(i>=capacity) {
|
||
|
i-=capacity;
|
||
|
}
|
||
|
return list[i];
|
||
|
}
|
||
|
|
||
|
// Find the lowest stored offset from a non-empty list, remove it,
|
||
|
// and reduce all other offsets by this minimum.
|
||
|
// Returns [1..maxLength].
|
||
|
int32_t popMinimum() {
|
||
|
// Look for the next offset in list[start+1..capacity-1].
|
||
|
int32_t i=start, result;
|
||
|
while(++i<capacity) {
|
||
|
if(list[i]) {
|
||
|
list[i]=false;
|
||
|
--length;
|
||
|
result=i-start;
|
||
|
start=i;
|
||
|
return result;
|
||
|
}
|
||
|
}
|
||
|
// i==capacity
|
||
|
|
||
|
// Wrap around and look for the next offset in list[0..start].
|
||
|
// Since the list is not empty, there will be one.
|
||
|
result=capacity-start;
|
||
|
i=0;
|
||
|
while(!list[i]) {
|
||
|
++i;
|
||
|
}
|
||
|
list[i]=false;
|
||
|
--length;
|
||
|
start=i;
|
||
|
return result+=i;
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
UBool *list;
|
||
|
int32_t capacity;
|
||
|
int32_t length;
|
||
|
int32_t start;
|
||
|
|
||
|
UBool staticList[16];
|
||
|
};
|
||
|
|
||
|
// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
|
||
|
static int32_t
|
||
|
getUTF8Length(const char16_t *s, int32_t length) {
|
||
|
UErrorCode errorCode=U_ZERO_ERROR;
|
||
|
int32_t length8=0;
|
||
|
u_strToUTF8(nullptr, 0, &length8, s, length, &errorCode);
|
||
|
if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||
|
return length8;
|
||
|
} else {
|
||
|
// The string contains an unpaired surrogate.
|
||
|
// Ignore this string.
|
||
|
return 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
|
||
|
static int32_t
|
||
|
appendUTF8(const char16_t *s, int32_t length, uint8_t *t, int32_t capacity) {
|
||
|
UErrorCode errorCode=U_ZERO_ERROR;
|
||
|
int32_t length8=0;
|
||
|
u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
|
||
|
if(U_SUCCESS(errorCode)) {
|
||
|
return length8;
|
||
|
} else {
|
||
|
// The string contains an unpaired surrogate.
|
||
|
// Ignore this string.
|
||
|
return 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static inline uint8_t
|
||
|
makeSpanLengthByte(int32_t spanLength) {
|
||
|
// 0xfe==UnicodeSetStringSpan::LONG_SPAN
|
||
|
return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
|
||
|
}
|
||
|
|
||
|
// Construct for all variants of span(), or only for any one variant.
|
||
|
// Initialize as little as possible, for single use.
|
||
|
UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
|
||
|
const UVector &setStrings,
|
||
|
uint32_t which)
|
||
|
: spanSet(0, 0x10ffff), pSpanNotSet(nullptr), strings(setStrings),
|
||
|
utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
|
||
|
utf8Length(0),
|
||
|
maxLength16(0), maxLength8(0),
|
||
|
all((UBool)(which==ALL)) {
|
||
|
spanSet.retainAll(set);
|
||
|
if(which&NOT_CONTAINED) {
|
||
|
// Default to the same sets.
|
||
|
// addToSpanNotSet() will create a separate set if necessary.
|
||
|
pSpanNotSet=&spanSet;
|
||
|
}
|
||
|
|
||
|
// Determine if the strings even need to be taken into account at all for span() etc.
|
||
|
// If any string is relevant, then all strings need to be used for
|
||
|
// span(longest match) but only the relevant ones for span(while contained).
|
||
|
// TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
|
||
|
// and do not store UTF-8 strings if !thisRelevant and CONTAINED.
|
||
|
// (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
|
||
|
// Also count the lengths of the UTF-8 versions of the strings for memory allocation.
|
||
|
int32_t stringsLength=strings.size();
|
||
|
|
||
|
int32_t i, spanLength;
|
||
|
UBool someRelevant=false;
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
|
||
|
const char16_t *s16=string.getBuffer();
|
||
|
int32_t length16=string.length();
|
||
|
if (length16==0) {
|
||
|
continue; // skip the empty string
|
||
|
}
|
||
|
UBool thisRelevant;
|
||
|
spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
|
||
|
if(spanLength<length16) { // Relevant string.
|
||
|
someRelevant=thisRelevant=true;
|
||
|
} else {
|
||
|
thisRelevant=false;
|
||
|
}
|
||
|
if((which&UTF16) && length16>maxLength16) {
|
||
|
maxLength16=length16;
|
||
|
}
|
||
|
if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
|
||
|
int32_t length8=getUTF8Length(s16, length16);
|
||
|
utf8Length+=length8;
|
||
|
if(length8>maxLength8) {
|
||
|
maxLength8=length8;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if(!someRelevant) {
|
||
|
maxLength16=maxLength8=0;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// Freeze after checking for the need to use strings at all because freezing
|
||
|
// a set takes some time and memory which are wasted if there are no relevant strings.
|
||
|
if(all) {
|
||
|
spanSet.freeze();
|
||
|
}
|
||
|
|
||
|
uint8_t *spanBackLengths;
|
||
|
uint8_t *spanUTF8Lengths;
|
||
|
uint8_t *spanBackUTF8Lengths;
|
||
|
|
||
|
// Allocate a block of meta data.
|
||
|
int32_t allocSize;
|
||
|
if(all) {
|
||
|
// UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
|
||
|
allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
|
||
|
} else {
|
||
|
allocSize=stringsLength; // One set of span lengths.
|
||
|
if(which&UTF8) {
|
||
|
// UTF-8 lengths and UTF-8 strings.
|
||
|
allocSize+=stringsLength*4+utf8Length;
|
||
|
}
|
||
|
}
|
||
|
if(allocSize<=(int32_t)sizeof(staticLengths)) {
|
||
|
utf8Lengths=staticLengths;
|
||
|
} else {
|
||
|
utf8Lengths=(int32_t *)uprv_malloc(allocSize);
|
||
|
if(utf8Lengths==nullptr) {
|
||
|
maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return false.
|
||
|
return; // Out of memory.
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(all) {
|
||
|
// Store span lengths for all span() variants.
|
||
|
spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
|
||
|
spanBackLengths=spanLengths+stringsLength;
|
||
|
spanUTF8Lengths=spanBackLengths+stringsLength;
|
||
|
spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
|
||
|
utf8=spanBackUTF8Lengths+stringsLength;
|
||
|
} else {
|
||
|
// Store span lengths for only one span() variant.
|
||
|
if(which&UTF8) {
|
||
|
spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
|
||
|
utf8=spanLengths+stringsLength;
|
||
|
} else {
|
||
|
spanLengths=(uint8_t *)utf8Lengths;
|
||
|
}
|
||
|
spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
|
||
|
}
|
||
|
|
||
|
// Set the meta data and pSpanNotSet and write the UTF-8 strings.
|
||
|
int32_t utf8Count=0; // Count UTF-8 bytes written so far.
|
||
|
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
|
||
|
const char16_t *s16=string.getBuffer();
|
||
|
int32_t length16=string.length();
|
||
|
spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
|
||
|
if(spanLength<length16 && length16>0) { // Relevant string.
|
||
|
if(which&UTF16) {
|
||
|
if(which&CONTAINED) {
|
||
|
if(which&FWD) {
|
||
|
spanLengths[i]=makeSpanLengthByte(spanLength);
|
||
|
}
|
||
|
if(which&BACK) {
|
||
|
spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
|
||
|
spanBackLengths[i]=makeSpanLengthByte(spanLength);
|
||
|
}
|
||
|
} else /* not CONTAINED, not all, but NOT_CONTAINED */ {
|
||
|
spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
|
||
|
}
|
||
|
}
|
||
|
if(which&UTF8) {
|
||
|
uint8_t *s8=utf8+utf8Count;
|
||
|
int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
|
||
|
utf8Count+=utf8Lengths[i]=length8;
|
||
|
if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
|
||
|
spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
|
||
|
} else { // Relevant for UTF-8.
|
||
|
if(which&CONTAINED) {
|
||
|
if(which&FWD) {
|
||
|
spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
|
||
|
spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
|
||
|
}
|
||
|
if(which&BACK) {
|
||
|
spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
|
||
|
spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
|
||
|
}
|
||
|
} else /* not CONTAINED, not all, but NOT_CONTAINED */ {
|
||
|
spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if(which&NOT_CONTAINED) {
|
||
|
// Add string start and end code points to the spanNotSet so that
|
||
|
// a span(while not contained) stops before any string.
|
||
|
UChar32 c;
|
||
|
if(which&FWD) {
|
||
|
int32_t len=0;
|
||
|
U16_NEXT(s16, len, length16, c);
|
||
|
addToSpanNotSet(c);
|
||
|
}
|
||
|
if(which&BACK) {
|
||
|
int32_t len=length16;
|
||
|
U16_PREV(s16, 0, len, c);
|
||
|
addToSpanNotSet(c);
|
||
|
}
|
||
|
}
|
||
|
} else { // Irrelevant string. (Also the empty string.)
|
||
|
if(which&UTF8) {
|
||
|
if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
|
||
|
uint8_t *s8=utf8+utf8Count;
|
||
|
int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
|
||
|
utf8Count+=utf8Lengths[i]=length8;
|
||
|
} else {
|
||
|
utf8Lengths[i]=0;
|
||
|
}
|
||
|
}
|
||
|
if(all) {
|
||
|
spanLengths[i]=spanBackLengths[i]=
|
||
|
spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
|
||
|
(uint8_t)ALL_CP_CONTAINED;
|
||
|
} else {
|
||
|
// All spanXYZLengths pointers contain the same address.
|
||
|
spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Finish.
|
||
|
if(all) {
|
||
|
pSpanNotSet->freeze();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Copy constructor. Assumes which==ALL for a frozen set.
|
||
|
UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
|
||
|
const UVector &newParentSetStrings)
|
||
|
: spanSet(otherStringSpan.spanSet), pSpanNotSet(nullptr), strings(newParentSetStrings),
|
||
|
utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
|
||
|
utf8Length(otherStringSpan.utf8Length),
|
||
|
maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
|
||
|
all(true) {
|
||
|
if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
|
||
|
pSpanNotSet=&spanSet;
|
||
|
} else {
|
||
|
pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
|
||
|
}
|
||
|
|
||
|
// Allocate a block of meta data.
|
||
|
// UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
|
||
|
int32_t stringsLength=strings.size();
|
||
|
int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
|
||
|
if(allocSize<=(int32_t)sizeof(staticLengths)) {
|
||
|
utf8Lengths=staticLengths;
|
||
|
} else {
|
||
|
utf8Lengths=(int32_t *)uprv_malloc(allocSize);
|
||
|
if(utf8Lengths==nullptr) {
|
||
|
maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return false.
|
||
|
return; // Out of memory.
|
||
|
}
|
||
|
}
|
||
|
|
||
|
spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
|
||
|
utf8=spanLengths+stringsLength*4;
|
||
|
uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
|
||
|
}
|
||
|
|
||
|
UnicodeSetStringSpan::~UnicodeSetStringSpan() {
|
||
|
if(pSpanNotSet!=nullptr && pSpanNotSet!=&spanSet) {
|
||
|
delete pSpanNotSet;
|
||
|
}
|
||
|
if(utf8Lengths!=nullptr && utf8Lengths!=staticLengths) {
|
||
|
uprv_free(utf8Lengths);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
|
||
|
if(pSpanNotSet==nullptr || pSpanNotSet==&spanSet) {
|
||
|
if(spanSet.contains(c)) {
|
||
|
return; // Nothing to do.
|
||
|
}
|
||
|
UnicodeSet *newSet=spanSet.cloneAsThawed();
|
||
|
if(newSet==nullptr) {
|
||
|
return; // Out of memory.
|
||
|
} else {
|
||
|
pSpanNotSet=newSet;
|
||
|
}
|
||
|
}
|
||
|
pSpanNotSet->add(c);
|
||
|
}
|
||
|
|
||
|
// Compare strings without any argument checks. Requires length>0.
|
||
|
static inline UBool
|
||
|
matches16(const char16_t *s, const char16_t *t, int32_t length) {
|
||
|
do {
|
||
|
if(*s++!=*t++) {
|
||
|
return false;
|
||
|
}
|
||
|
} while(--length>0);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
static inline UBool
|
||
|
matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
|
||
|
do {
|
||
|
if(*s++!=*t++) {
|
||
|
return false;
|
||
|
}
|
||
|
} while(--length>0);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// Compare 16-bit Unicode strings (which may be malformed UTF-16)
|
||
|
// at code point boundaries.
|
||
|
// That is, each edge of a match must not be in the middle of a surrogate pair.
|
||
|
static inline UBool
|
||
|
matches16CPB(const char16_t *s, int32_t start, int32_t limit, const char16_t *t, int32_t length) {
|
||
|
s+=start;
|
||
|
limit-=start;
|
||
|
return matches16(s, t, length) &&
|
||
|
!(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
|
||
|
!(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
|
||
|
}
|
||
|
|
||
|
// Does the set contain the next code point?
|
||
|
// If so, return its length; otherwise return its negative length.
|
||
|
static inline int32_t
|
||
|
spanOne(const UnicodeSet &set, const char16_t *s, int32_t length) {
|
||
|
char16_t c=*s, c2;
|
||
|
if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
|
||
|
return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
|
||
|
}
|
||
|
return set.contains(c) ? 1 : -1;
|
||
|
}
|
||
|
|
||
|
static inline int32_t
|
||
|
spanOneBack(const UnicodeSet &set, const char16_t *s, int32_t length) {
|
||
|
char16_t c=s[length-1], c2;
|
||
|
if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
|
||
|
return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
|
||
|
}
|
||
|
return set.contains(c) ? 1 : -1;
|
||
|
}
|
||
|
|
||
|
static inline int32_t
|
||
|
spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
|
||
|
UChar32 c=*s;
|
||
|
if(U8_IS_SINGLE(c)) {
|
||
|
return set.contains(c) ? 1 : -1;
|
||
|
}
|
||
|
// Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
|
||
|
int32_t i=0;
|
||
|
U8_NEXT_OR_FFFD(s, i, length, c);
|
||
|
return set.contains(c) ? i : -i;
|
||
|
}
|
||
|
|
||
|
static inline int32_t
|
||
|
spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
|
||
|
UChar32 c=s[length-1];
|
||
|
if(U8_IS_SINGLE(c)) {
|
||
|
return set.contains(c) ? 1 : -1;
|
||
|
}
|
||
|
int32_t i=length-1;
|
||
|
c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
|
||
|
length-=i;
|
||
|
return set.contains(c) ? length : -length;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Note: In span() when spanLength==0 (after a string match, or at the beginning
|
||
|
* after an empty code point span) and in spanNot() and spanNotUTF8(),
|
||
|
* string matching could use a binary search
|
||
|
* because all string matches are done from the same start index.
|
||
|
*
|
||
|
* For UTF-8, this would require a comparison function that returns UTF-16 order.
|
||
|
*
|
||
|
* This optimization should not be necessary for normal UnicodeSets because
|
||
|
* most sets have no strings, and most sets with strings have
|
||
|
* very few very short strings.
|
||
|
* For cases with many strings, it might be better to use a different API
|
||
|
* and implementation with a DFA (state machine).
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* Algorithm for span(USET_SPAN_CONTAINED)
|
||
|
*
|
||
|
* Theoretical algorithm:
|
||
|
* - Iterate through the string, and at each code point boundary:
|
||
|
* + If the code point there is in the set, then remember to continue after it.
|
||
|
* + If a set string matches at the current position, then remember to continue after it.
|
||
|
* + Either recursively span for each code point or string match,
|
||
|
* or recursively span for all but the shortest one and
|
||
|
* iteratively continue the span with the shortest local match.
|
||
|
* + Remember the longest recursive span (the farthest end point).
|
||
|
* + If there is no match at the current position, neither for the code point there
|
||
|
* nor for any set string, then stop and return the longest recursive span length.
|
||
|
*
|
||
|
* Optimized implementation:
|
||
|
*
|
||
|
* (We assume that most sets will have very few very short strings.
|
||
|
* A span using a string-less set is extremely fast.)
|
||
|
*
|
||
|
* Create and cache a spanSet which contains all of the single code points
|
||
|
* of the original set but none of its strings.
|
||
|
*
|
||
|
* - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
|
||
|
* - Loop:
|
||
|
* + Try to match each set string at the end of the spanLength.
|
||
|
* ~ Set strings that start with set-contained code points must be matched
|
||
|
* with a partial overlap because the recursive algorithm would have tried
|
||
|
* to match them at every position.
|
||
|
* ~ Set strings that entirely consist of set-contained code points
|
||
|
* are irrelevant for span(USET_SPAN_CONTAINED) because the
|
||
|
* recursive algorithm would continue after them anyway
|
||
|
* and find the longest recursive match from their end.
|
||
|
* ~ Rather than recursing, note each end point of a set string match.
|
||
|
* + If no set string matched after spanSet.span(), then return
|
||
|
* with where the spanSet.span() ended.
|
||
|
* + If at least one set string matched after spanSet.span(), then
|
||
|
* pop the shortest string match end point and continue
|
||
|
* the loop, trying to match all set strings from there.
|
||
|
* + If at least one more set string matched after a previous string match,
|
||
|
* then test if the code point after the previous string match is also
|
||
|
* contained in the set.
|
||
|
* Continue the loop with the shortest end point of either this code point
|
||
|
* or a matching set string.
|
||
|
* + If no more set string matched after a previous string match,
|
||
|
* then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
|
||
|
* Stop if spanLength==0, otherwise continue the loop.
|
||
|
*
|
||
|
* By noting each end point of a set string match,
|
||
|
* the function visits each string position at most once and finishes
|
||
|
* in linear time.
|
||
|
*
|
||
|
* The recursive algorithm may visit the same string position many times
|
||
|
* if multiple paths lead to it and finishes in exponential time.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* Algorithm for span(USET_SPAN_SIMPLE)
|
||
|
*
|
||
|
* Theoretical algorithm:
|
||
|
* - Iterate through the string, and at each code point boundary:
|
||
|
* + If the code point there is in the set, then remember to continue after it.
|
||
|
* + If a set string matches at the current position, then remember to continue after it.
|
||
|
* + Continue from the farthest match position and ignore all others.
|
||
|
* + If there is no match at the current position,
|
||
|
* then stop and return the current position.
|
||
|
*
|
||
|
* Optimized implementation:
|
||
|
*
|
||
|
* (Same assumption and spanSet as above.)
|
||
|
*
|
||
|
* - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
|
||
|
* - Loop:
|
||
|
* + Try to match each set string at the end of the spanLength.
|
||
|
* ~ Set strings that start with set-contained code points must be matched
|
||
|
* with a partial overlap because the standard algorithm would have tried
|
||
|
* to match them earlier.
|
||
|
* ~ Set strings that entirely consist of set-contained code points
|
||
|
* must be matched with a full overlap because the longest-match algorithm
|
||
|
* would hide set string matches that end earlier.
|
||
|
* Such set strings need not be matched earlier inside the code point span
|
||
|
* because the standard algorithm would then have continued after
|
||
|
* the set string match anyway.
|
||
|
* ~ Remember the longest set string match (farthest end point) from the earliest
|
||
|
* starting point.
|
||
|
* + If no set string matched after spanSet.span(), then return
|
||
|
* with where the spanSet.span() ended.
|
||
|
* + If at least one set string matched, then continue the loop after the
|
||
|
* longest match from the earliest position.
|
||
|
* + If no more set string matched after a previous string match,
|
||
|
* then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
|
||
|
* Stop if spanLength==0, otherwise continue the loop.
|
||
|
*/
|
||
|
|
||
|
int32_t UnicodeSetStringSpan::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
|
||
|
if(spanCondition==USET_SPAN_NOT_CONTAINED) {
|
||
|
return spanNot(s, length);
|
||
|
}
|
||
|
int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
|
||
|
if(spanLength==length) {
|
||
|
return length;
|
||
|
}
|
||
|
|
||
|
// Consider strings; they may overlap with the span.
|
||
|
OffsetList offsets;
|
||
|
if(spanCondition==USET_SPAN_CONTAINED) {
|
||
|
// Use offset list to try all possibilities.
|
||
|
offsets.setMaxLength(maxLength16);
|
||
|
}
|
||
|
int32_t pos=spanLength, rest=length-pos;
|
||
|
int32_t i, stringsLength=strings.size();
|
||
|
for(;;) {
|
||
|
if(spanCondition==USET_SPAN_CONTAINED) {
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
int32_t overlap=spanLengths[i];
|
||
|
if(overlap==ALL_CP_CONTAINED) {
|
||
|
continue; // Irrelevant string. (Also the empty string.)
|
||
|
}
|
||
|
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
|
||
|
const char16_t *s16=string.getBuffer();
|
||
|
int32_t length16=string.length();
|
||
|
U_ASSERT(length>0);
|
||
|
|
||
|
// Try to match this string at pos-overlap..pos.
|
||
|
if(overlap>=LONG_SPAN) {
|
||
|
overlap=length16;
|
||
|
// While contained: No point matching fully inside the code point span.
|
||
|
U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
|
||
|
}
|
||
|
if(overlap>spanLength) {
|
||
|
overlap=spanLength;
|
||
|
}
|
||
|
int32_t inc=length16-overlap; // Keep overlap+inc==length16.
|
||
|
for(;;) {
|
||
|
if(inc>rest) {
|
||
|
break;
|
||
|
}
|
||
|
// Try to match if the increment is not listed already.
|
||
|
if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
|
||
|
if(inc==rest) {
|
||
|
return length; // Reached the end of the string.
|
||
|
}
|
||
|
offsets.addOffset(inc);
|
||
|
}
|
||
|
if(overlap==0) {
|
||
|
break;
|
||
|
}
|
||
|
--overlap;
|
||
|
++inc;
|
||
|
}
|
||
|
}
|
||
|
} else /* USET_SPAN_SIMPLE */ {
|
||
|
int32_t maxInc=0, maxOverlap=0;
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
int32_t overlap=spanLengths[i];
|
||
|
// For longest match, we do need to try to match even an all-contained string
|
||
|
// to find the match from the earliest start.
|
||
|
|
||
|
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
|
||
|
const char16_t *s16=string.getBuffer();
|
||
|
int32_t length16=string.length();
|
||
|
if (length16==0) {
|
||
|
continue; // skip the empty string
|
||
|
}
|
||
|
|
||
|
// Try to match this string at pos-overlap..pos.
|
||
|
if(overlap>=LONG_SPAN) {
|
||
|
overlap=length16;
|
||
|
// Longest match: Need to match fully inside the code point span
|
||
|
// to find the match from the earliest start.
|
||
|
}
|
||
|
if(overlap>spanLength) {
|
||
|
overlap=spanLength;
|
||
|
}
|
||
|
int32_t inc=length16-overlap; // Keep overlap+inc==length16.
|
||
|
for(;;) {
|
||
|
if(inc>rest || overlap<maxOverlap) {
|
||
|
break;
|
||
|
}
|
||
|
// Try to match if the string is longer or starts earlier.
|
||
|
if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
|
||
|
matches16CPB(s, pos-overlap, length, s16, length16)
|
||
|
) {
|
||
|
maxInc=inc; // Longest match from earliest start.
|
||
|
maxOverlap=overlap;
|
||
|
break;
|
||
|
}
|
||
|
--overlap;
|
||
|
++inc;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(maxInc!=0 || maxOverlap!=0) {
|
||
|
// Longest-match algorithm, and there was a string match.
|
||
|
// Simply continue after it.
|
||
|
pos+=maxInc;
|
||
|
rest-=maxInc;
|
||
|
if(rest==0) {
|
||
|
return length; // Reached the end of the string.
|
||
|
}
|
||
|
spanLength=0; // Match strings from after a string match.
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
// Finished trying to match all strings at pos.
|
||
|
|
||
|
if(spanLength!=0 || pos==0) {
|
||
|
// The position is after an unlimited code point span (spanLength!=0),
|
||
|
// not after a string match.
|
||
|
// The only position where spanLength==0 after a span is pos==0.
|
||
|
// Otherwise, an unlimited code point span is only tried again when no
|
||
|
// strings match, and if such a non-initial span fails we stop.
|
||
|
if(offsets.isEmpty()) {
|
||
|
return pos; // No strings matched after a span.
|
||
|
}
|
||
|
// Match strings from after the next string match.
|
||
|
} else {
|
||
|
// The position is after a string match (or a single code point).
|
||
|
if(offsets.isEmpty()) {
|
||
|
// No more strings matched after a previous string match.
|
||
|
// Try another code point span from after the last string match.
|
||
|
spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
|
||
|
if( spanLength==rest || // Reached the end of the string, or
|
||
|
spanLength==0 // neither strings nor span progressed.
|
||
|
) {
|
||
|
return pos+spanLength;
|
||
|
}
|
||
|
pos+=spanLength;
|
||
|
rest-=spanLength;
|
||
|
continue; // spanLength>0: Match strings from after a span.
|
||
|
} else {
|
||
|
// Try to match only one code point from after a string match if some
|
||
|
// string matched beyond it, so that we try all possible positions
|
||
|
// and don't overshoot.
|
||
|
spanLength=spanOne(spanSet, s+pos, rest);
|
||
|
if(spanLength>0) {
|
||
|
if(spanLength==rest) {
|
||
|
return length; // Reached the end of the string.
|
||
|
}
|
||
|
// Match strings after this code point.
|
||
|
// There cannot be any increments below it because UnicodeSet strings
|
||
|
// contain multiple code points.
|
||
|
pos+=spanLength;
|
||
|
rest-=spanLength;
|
||
|
offsets.shift(spanLength);
|
||
|
spanLength=0;
|
||
|
continue; // Match strings from after a single code point.
|
||
|
}
|
||
|
// Match strings from after the next string match.
|
||
|
}
|
||
|
}
|
||
|
int32_t minOffset=offsets.popMinimum();
|
||
|
pos+=minOffset;
|
||
|
rest-=minOffset;
|
||
|
spanLength=0; // Match strings from after a string match.
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int32_t UnicodeSetStringSpan::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
|
||
|
if(spanCondition==USET_SPAN_NOT_CONTAINED) {
|
||
|
return spanNotBack(s, length);
|
||
|
}
|
||
|
int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
|
||
|
if(pos==0) {
|
||
|
return 0;
|
||
|
}
|
||
|
int32_t spanLength=length-pos;
|
||
|
|
||
|
// Consider strings; they may overlap with the span.
|
||
|
OffsetList offsets;
|
||
|
if(spanCondition==USET_SPAN_CONTAINED) {
|
||
|
// Use offset list to try all possibilities.
|
||
|
offsets.setMaxLength(maxLength16);
|
||
|
}
|
||
|
int32_t i, stringsLength=strings.size();
|
||
|
uint8_t *spanBackLengths=spanLengths;
|
||
|
if(all) {
|
||
|
spanBackLengths+=stringsLength;
|
||
|
}
|
||
|
for(;;) {
|
||
|
if(spanCondition==USET_SPAN_CONTAINED) {
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
int32_t overlap=spanBackLengths[i];
|
||
|
if(overlap==ALL_CP_CONTAINED) {
|
||
|
continue; // Irrelevant string. (Also the empty string.)
|
||
|
}
|
||
|
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
|
||
|
const char16_t *s16=string.getBuffer();
|
||
|
int32_t length16=string.length();
|
||
|
U_ASSERT(length>0);
|
||
|
|
||
|
// Try to match this string at pos-(length16-overlap)..pos-length16.
|
||
|
if(overlap>=LONG_SPAN) {
|
||
|
overlap=length16;
|
||
|
// While contained: No point matching fully inside the code point span.
|
||
|
int32_t len1=0;
|
||
|
U16_FWD_1(s16, len1, overlap);
|
||
|
overlap-=len1; // Length of the string minus the first code point.
|
||
|
}
|
||
|
if(overlap>spanLength) {
|
||
|
overlap=spanLength;
|
||
|
}
|
||
|
int32_t dec=length16-overlap; // Keep dec+overlap==length16.
|
||
|
for(;;) {
|
||
|
if(dec>pos) {
|
||
|
break;
|
||
|
}
|
||
|
// Try to match if the decrement is not listed already.
|
||
|
if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
|
||
|
if(dec==pos) {
|
||
|
return 0; // Reached the start of the string.
|
||
|
}
|
||
|
offsets.addOffset(dec);
|
||
|
}
|
||
|
if(overlap==0) {
|
||
|
break;
|
||
|
}
|
||
|
--overlap;
|
||
|
++dec;
|
||
|
}
|
||
|
}
|
||
|
} else /* USET_SPAN_SIMPLE */ {
|
||
|
int32_t maxDec=0, maxOverlap=0;
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
int32_t overlap=spanBackLengths[i];
|
||
|
// For longest match, we do need to try to match even an all-contained string
|
||
|
// to find the match from the latest end.
|
||
|
|
||
|
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
|
||
|
const char16_t *s16=string.getBuffer();
|
||
|
int32_t length16=string.length();
|
||
|
if (length16==0) {
|
||
|
continue; // skip the empty string
|
||
|
}
|
||
|
|
||
|
// Try to match this string at pos-(length16-overlap)..pos-length16.
|
||
|
if(overlap>=LONG_SPAN) {
|
||
|
overlap=length16;
|
||
|
// Longest match: Need to match fully inside the code point span
|
||
|
// to find the match from the latest end.
|
||
|
}
|
||
|
if(overlap>spanLength) {
|
||
|
overlap=spanLength;
|
||
|
}
|
||
|
int32_t dec=length16-overlap; // Keep dec+overlap==length16.
|
||
|
for(;;) {
|
||
|
if(dec>pos || overlap<maxOverlap) {
|
||
|
break;
|
||
|
}
|
||
|
// Try to match if the string is longer or ends later.
|
||
|
if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
|
||
|
matches16CPB(s, pos-dec, length, s16, length16)
|
||
|
) {
|
||
|
maxDec=dec; // Longest match from latest end.
|
||
|
maxOverlap=overlap;
|
||
|
break;
|
||
|
}
|
||
|
--overlap;
|
||
|
++dec;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(maxDec!=0 || maxOverlap!=0) {
|
||
|
// Longest-match algorithm, and there was a string match.
|
||
|
// Simply continue before it.
|
||
|
pos-=maxDec;
|
||
|
if(pos==0) {
|
||
|
return 0; // Reached the start of the string.
|
||
|
}
|
||
|
spanLength=0; // Match strings from before a string match.
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
// Finished trying to match all strings at pos.
|
||
|
|
||
|
if(spanLength!=0 || pos==length) {
|
||
|
// The position is before an unlimited code point span (spanLength!=0),
|
||
|
// not before a string match.
|
||
|
// The only position where spanLength==0 before a span is pos==length.
|
||
|
// Otherwise, an unlimited code point span is only tried again when no
|
||
|
// strings match, and if such a non-initial span fails we stop.
|
||
|
if(offsets.isEmpty()) {
|
||
|
return pos; // No strings matched before a span.
|
||
|
}
|
||
|
// Match strings from before the next string match.
|
||
|
} else {
|
||
|
// The position is before a string match (or a single code point).
|
||
|
if(offsets.isEmpty()) {
|
||
|
// No more strings matched before a previous string match.
|
||
|
// Try another code point span from before the last string match.
|
||
|
int32_t oldPos=pos;
|
||
|
pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
|
||
|
spanLength=oldPos-pos;
|
||
|
if( pos==0 || // Reached the start of the string, or
|
||
|
spanLength==0 // neither strings nor span progressed.
|
||
|
) {
|
||
|
return pos;
|
||
|
}
|
||
|
continue; // spanLength>0: Match strings from before a span.
|
||
|
} else {
|
||
|
// Try to match only one code point from before a string match if some
|
||
|
// string matched beyond it, so that we try all possible positions
|
||
|
// and don't overshoot.
|
||
|
spanLength=spanOneBack(spanSet, s, pos);
|
||
|
if(spanLength>0) {
|
||
|
if(spanLength==pos) {
|
||
|
return 0; // Reached the start of the string.
|
||
|
}
|
||
|
// Match strings before this code point.
|
||
|
// There cannot be any decrements below it because UnicodeSet strings
|
||
|
// contain multiple code points.
|
||
|
pos-=spanLength;
|
||
|
offsets.shift(spanLength);
|
||
|
spanLength=0;
|
||
|
continue; // Match strings from before a single code point.
|
||
|
}
|
||
|
// Match strings from before the next string match.
|
||
|
}
|
||
|
}
|
||
|
pos-=offsets.popMinimum();
|
||
|
spanLength=0; // Match strings from before a string match.
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
|
||
|
if(spanCondition==USET_SPAN_NOT_CONTAINED) {
|
||
|
return spanNotUTF8(s, length);
|
||
|
}
|
||
|
int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
|
||
|
if(spanLength==length) {
|
||
|
return length;
|
||
|
}
|
||
|
|
||
|
// Consider strings; they may overlap with the span.
|
||
|
OffsetList offsets;
|
||
|
if(spanCondition==USET_SPAN_CONTAINED) {
|
||
|
// Use offset list to try all possibilities.
|
||
|
offsets.setMaxLength(maxLength8);
|
||
|
}
|
||
|
int32_t pos=spanLength, rest=length-pos;
|
||
|
int32_t i, stringsLength=strings.size();
|
||
|
uint8_t *spanUTF8Lengths=spanLengths;
|
||
|
if(all) {
|
||
|
spanUTF8Lengths+=2*stringsLength;
|
||
|
}
|
||
|
for(;;) {
|
||
|
const uint8_t *s8=utf8;
|
||
|
int32_t length8;
|
||
|
if(spanCondition==USET_SPAN_CONTAINED) {
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
length8=utf8Lengths[i];
|
||
|
if(length8==0) {
|
||
|
continue; // String not representable in UTF-8.
|
||
|
}
|
||
|
int32_t overlap=spanUTF8Lengths[i];
|
||
|
if(overlap==ALL_CP_CONTAINED) {
|
||
|
s8+=length8;
|
||
|
continue; // Irrelevant string.
|
||
|
}
|
||
|
|
||
|
// Try to match this string at pos-overlap..pos.
|
||
|
if(overlap>=LONG_SPAN) {
|
||
|
overlap=length8;
|
||
|
// While contained: No point matching fully inside the code point span.
|
||
|
U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
|
||
|
}
|
||
|
if(overlap>spanLength) {
|
||
|
overlap=spanLength;
|
||
|
}
|
||
|
int32_t inc=length8-overlap; // Keep overlap+inc==length8.
|
||
|
for(;;) {
|
||
|
if(inc>rest) {
|
||
|
break;
|
||
|
}
|
||
|
// Try to match if the increment is not listed already.
|
||
|
// Match at code point boundaries. (The UTF-8 strings were converted
|
||
|
// from UTF-16 and are guaranteed to be well-formed.)
|
||
|
if(!U8_IS_TRAIL(s[pos-overlap]) &&
|
||
|
!offsets.containsOffset(inc) &&
|
||
|
matches8(s+pos-overlap, s8, length8)) {
|
||
|
if(inc==rest) {
|
||
|
return length; // Reached the end of the string.
|
||
|
}
|
||
|
offsets.addOffset(inc);
|
||
|
}
|
||
|
if(overlap==0) {
|
||
|
break;
|
||
|
}
|
||
|
--overlap;
|
||
|
++inc;
|
||
|
}
|
||
|
s8+=length8;
|
||
|
}
|
||
|
} else /* USET_SPAN_SIMPLE */ {
|
||
|
int32_t maxInc=0, maxOverlap=0;
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
length8=utf8Lengths[i];
|
||
|
if(length8==0) {
|
||
|
continue; // String not representable in UTF-8.
|
||
|
}
|
||
|
int32_t overlap=spanUTF8Lengths[i];
|
||
|
// For longest match, we do need to try to match even an all-contained string
|
||
|
// to find the match from the earliest start.
|
||
|
|
||
|
// Try to match this string at pos-overlap..pos.
|
||
|
if(overlap>=LONG_SPAN) {
|
||
|
overlap=length8;
|
||
|
// Longest match: Need to match fully inside the code point span
|
||
|
// to find the match from the earliest start.
|
||
|
}
|
||
|
if(overlap>spanLength) {
|
||
|
overlap=spanLength;
|
||
|
}
|
||
|
int32_t inc=length8-overlap; // Keep overlap+inc==length8.
|
||
|
for(;;) {
|
||
|
if(inc>rest || overlap<maxOverlap) {
|
||
|
break;
|
||
|
}
|
||
|
// Try to match if the string is longer or starts earlier.
|
||
|
// Match at code point boundaries. (The UTF-8 strings were converted
|
||
|
// from UTF-16 and are guaranteed to be well-formed.)
|
||
|
if(!U8_IS_TRAIL(s[pos-overlap]) &&
|
||
|
(overlap>maxOverlap ||
|
||
|
/* redundant overlap==maxOverlap && */ inc>maxInc) &&
|
||
|
matches8(s+pos-overlap, s8, length8)) {
|
||
|
maxInc=inc; // Longest match from earliest start.
|
||
|
maxOverlap=overlap;
|
||
|
break;
|
||
|
}
|
||
|
--overlap;
|
||
|
++inc;
|
||
|
}
|
||
|
s8+=length8;
|
||
|
}
|
||
|
|
||
|
if(maxInc!=0 || maxOverlap!=0) {
|
||
|
// Longest-match algorithm, and there was a string match.
|
||
|
// Simply continue after it.
|
||
|
pos+=maxInc;
|
||
|
rest-=maxInc;
|
||
|
if(rest==0) {
|
||
|
return length; // Reached the end of the string.
|
||
|
}
|
||
|
spanLength=0; // Match strings from after a string match.
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
// Finished trying to match all strings at pos.
|
||
|
|
||
|
if(spanLength!=0 || pos==0) {
|
||
|
// The position is after an unlimited code point span (spanLength!=0),
|
||
|
// not after a string match.
|
||
|
// The only position where spanLength==0 after a span is pos==0.
|
||
|
// Otherwise, an unlimited code point span is only tried again when no
|
||
|
// strings match, and if such a non-initial span fails we stop.
|
||
|
if(offsets.isEmpty()) {
|
||
|
return pos; // No strings matched after a span.
|
||
|
}
|
||
|
// Match strings from after the next string match.
|
||
|
} else {
|
||
|
// The position is after a string match (or a single code point).
|
||
|
if(offsets.isEmpty()) {
|
||
|
// No more strings matched after a previous string match.
|
||
|
// Try another code point span from after the last string match.
|
||
|
spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
|
||
|
if( spanLength==rest || // Reached the end of the string, or
|
||
|
spanLength==0 // neither strings nor span progressed.
|
||
|
) {
|
||
|
return pos+spanLength;
|
||
|
}
|
||
|
pos+=spanLength;
|
||
|
rest-=spanLength;
|
||
|
continue; // spanLength>0: Match strings from after a span.
|
||
|
} else {
|
||
|
// Try to match only one code point from after a string match if some
|
||
|
// string matched beyond it, so that we try all possible positions
|
||
|
// and don't overshoot.
|
||
|
spanLength=spanOneUTF8(spanSet, s+pos, rest);
|
||
|
if(spanLength>0) {
|
||
|
if(spanLength==rest) {
|
||
|
return length; // Reached the end of the string.
|
||
|
}
|
||
|
// Match strings after this code point.
|
||
|
// There cannot be any increments below it because UnicodeSet strings
|
||
|
// contain multiple code points.
|
||
|
pos+=spanLength;
|
||
|
rest-=spanLength;
|
||
|
offsets.shift(spanLength);
|
||
|
spanLength=0;
|
||
|
continue; // Match strings from after a single code point.
|
||
|
}
|
||
|
// Match strings from after the next string match.
|
||
|
}
|
||
|
}
|
||
|
int32_t minOffset=offsets.popMinimum();
|
||
|
pos+=minOffset;
|
||
|
rest-=minOffset;
|
||
|
spanLength=0; // Match strings from after a string match.
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
|
||
|
if(spanCondition==USET_SPAN_NOT_CONTAINED) {
|
||
|
return spanNotBackUTF8(s, length);
|
||
|
}
|
||
|
int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
|
||
|
if(pos==0) {
|
||
|
return 0;
|
||
|
}
|
||
|
int32_t spanLength=length-pos;
|
||
|
|
||
|
// Consider strings; they may overlap with the span.
|
||
|
OffsetList offsets;
|
||
|
if(spanCondition==USET_SPAN_CONTAINED) {
|
||
|
// Use offset list to try all possibilities.
|
||
|
offsets.setMaxLength(maxLength8);
|
||
|
}
|
||
|
int32_t i, stringsLength=strings.size();
|
||
|
uint8_t *spanBackUTF8Lengths=spanLengths;
|
||
|
if(all) {
|
||
|
spanBackUTF8Lengths+=3*stringsLength;
|
||
|
}
|
||
|
for(;;) {
|
||
|
const uint8_t *s8=utf8;
|
||
|
int32_t length8;
|
||
|
if(spanCondition==USET_SPAN_CONTAINED) {
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
length8=utf8Lengths[i];
|
||
|
if(length8==0) {
|
||
|
continue; // String not representable in UTF-8.
|
||
|
}
|
||
|
int32_t overlap=spanBackUTF8Lengths[i];
|
||
|
if(overlap==ALL_CP_CONTAINED) {
|
||
|
s8+=length8;
|
||
|
continue; // Irrelevant string.
|
||
|
}
|
||
|
|
||
|
// Try to match this string at pos-(length8-overlap)..pos-length8.
|
||
|
if(overlap>=LONG_SPAN) {
|
||
|
overlap=length8;
|
||
|
// While contained: No point matching fully inside the code point span.
|
||
|
int32_t len1=0;
|
||
|
U8_FWD_1(s8, len1, overlap);
|
||
|
overlap-=len1; // Length of the string minus the first code point.
|
||
|
}
|
||
|
if(overlap>spanLength) {
|
||
|
overlap=spanLength;
|
||
|
}
|
||
|
int32_t dec=length8-overlap; // Keep dec+overlap==length8.
|
||
|
for(;;) {
|
||
|
if(dec>pos) {
|
||
|
break;
|
||
|
}
|
||
|
// Try to match if the decrement is not listed already.
|
||
|
// Match at code point boundaries. (The UTF-8 strings were converted
|
||
|
// from UTF-16 and are guaranteed to be well-formed.)
|
||
|
if( !U8_IS_TRAIL(s[pos-dec]) &&
|
||
|
!offsets.containsOffset(dec) &&
|
||
|
matches8(s+pos-dec, s8, length8)
|
||
|
) {
|
||
|
if(dec==pos) {
|
||
|
return 0; // Reached the start of the string.
|
||
|
}
|
||
|
offsets.addOffset(dec);
|
||
|
}
|
||
|
if(overlap==0) {
|
||
|
break;
|
||
|
}
|
||
|
--overlap;
|
||
|
++dec;
|
||
|
}
|
||
|
s8+=length8;
|
||
|
}
|
||
|
} else /* USET_SPAN_SIMPLE */ {
|
||
|
int32_t maxDec=0, maxOverlap=0;
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
length8=utf8Lengths[i];
|
||
|
if(length8==0) {
|
||
|
continue; // String not representable in UTF-8.
|
||
|
}
|
||
|
int32_t overlap=spanBackUTF8Lengths[i];
|
||
|
// For longest match, we do need to try to match even an all-contained string
|
||
|
// to find the match from the latest end.
|
||
|
|
||
|
// Try to match this string at pos-(length8-overlap)..pos-length8.
|
||
|
if(overlap>=LONG_SPAN) {
|
||
|
overlap=length8;
|
||
|
// Longest match: Need to match fully inside the code point span
|
||
|
// to find the match from the latest end.
|
||
|
}
|
||
|
if(overlap>spanLength) {
|
||
|
overlap=spanLength;
|
||
|
}
|
||
|
int32_t dec=length8-overlap; // Keep dec+overlap==length8.
|
||
|
for(;;) {
|
||
|
if(dec>pos || overlap<maxOverlap) {
|
||
|
break;
|
||
|
}
|
||
|
// Try to match if the string is longer or ends later.
|
||
|
// Match at code point boundaries. (The UTF-8 strings were converted
|
||
|
// from UTF-16 and are guaranteed to be well-formed.)
|
||
|
if( !U8_IS_TRAIL(s[pos-dec]) &&
|
||
|
(overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
|
||
|
matches8(s+pos-dec, s8, length8)
|
||
|
) {
|
||
|
maxDec=dec; // Longest match from latest end.
|
||
|
maxOverlap=overlap;
|
||
|
break;
|
||
|
}
|
||
|
--overlap;
|
||
|
++dec;
|
||
|
}
|
||
|
s8+=length8;
|
||
|
}
|
||
|
|
||
|
if(maxDec!=0 || maxOverlap!=0) {
|
||
|
// Longest-match algorithm, and there was a string match.
|
||
|
// Simply continue before it.
|
||
|
pos-=maxDec;
|
||
|
if(pos==0) {
|
||
|
return 0; // Reached the start of the string.
|
||
|
}
|
||
|
spanLength=0; // Match strings from before a string match.
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
// Finished trying to match all strings at pos.
|
||
|
|
||
|
if(spanLength!=0 || pos==length) {
|
||
|
// The position is before an unlimited code point span (spanLength!=0),
|
||
|
// not before a string match.
|
||
|
// The only position where spanLength==0 before a span is pos==length.
|
||
|
// Otherwise, an unlimited code point span is only tried again when no
|
||
|
// strings match, and if such a non-initial span fails we stop.
|
||
|
if(offsets.isEmpty()) {
|
||
|
return pos; // No strings matched before a span.
|
||
|
}
|
||
|
// Match strings from before the next string match.
|
||
|
} else {
|
||
|
// The position is before a string match (or a single code point).
|
||
|
if(offsets.isEmpty()) {
|
||
|
// No more strings matched before a previous string match.
|
||
|
// Try another code point span from before the last string match.
|
||
|
int32_t oldPos=pos;
|
||
|
pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
|
||
|
spanLength=oldPos-pos;
|
||
|
if( pos==0 || // Reached the start of the string, or
|
||
|
spanLength==0 // neither strings nor span progressed.
|
||
|
) {
|
||
|
return pos;
|
||
|
}
|
||
|
continue; // spanLength>0: Match strings from before a span.
|
||
|
} else {
|
||
|
// Try to match only one code point from before a string match if some
|
||
|
// string matched beyond it, so that we try all possible positions
|
||
|
// and don't overshoot.
|
||
|
spanLength=spanOneBackUTF8(spanSet, s, pos);
|
||
|
if(spanLength>0) {
|
||
|
if(spanLength==pos) {
|
||
|
return 0; // Reached the start of the string.
|
||
|
}
|
||
|
// Match strings before this code point.
|
||
|
// There cannot be any decrements below it because UnicodeSet strings
|
||
|
// contain multiple code points.
|
||
|
pos-=spanLength;
|
||
|
offsets.shift(spanLength);
|
||
|
spanLength=0;
|
||
|
continue; // Match strings from before a single code point.
|
||
|
}
|
||
|
// Match strings from before the next string match.
|
||
|
}
|
||
|
}
|
||
|
pos-=offsets.popMinimum();
|
||
|
spanLength=0; // Match strings from before a string match.
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
|
||
|
*
|
||
|
* Theoretical algorithm:
|
||
|
* - Iterate through the string, and at each code point boundary:
|
||
|
* + If the code point there is in the set, then return with the current position.
|
||
|
* + If a set string matches at the current position, then return with the current position.
|
||
|
*
|
||
|
* Optimized implementation:
|
||
|
*
|
||
|
* (Same assumption as for span() above.)
|
||
|
*
|
||
|
* Create and cache a spanNotSet which contains all of the single code points
|
||
|
* of the original set but none of its strings.
|
||
|
* For each set string add its initial code point to the spanNotSet.
|
||
|
* (Also add its final code point for spanNotBack().)
|
||
|
*
|
||
|
* - Loop:
|
||
|
* + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
|
||
|
* + If the current code point is in the original set, then
|
||
|
* return the current position.
|
||
|
* + If any set string matches at the current position, then
|
||
|
* return the current position.
|
||
|
* + If there is no match at the current position, neither for the code point there
|
||
|
* nor for any set string, then skip this code point and continue the loop.
|
||
|
* This happens for set-string-initial code points that were added to spanNotSet
|
||
|
* when there is not actually a match for such a set string.
|
||
|
*/
|
||
|
|
||
|
int32_t UnicodeSetStringSpan::spanNot(const char16_t *s, int32_t length) const {
|
||
|
int32_t pos=0, rest=length;
|
||
|
int32_t i, stringsLength=strings.size();
|
||
|
do {
|
||
|
// Span until we find a code point from the set,
|
||
|
// or a code point that starts or ends some string.
|
||
|
i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
|
||
|
if(i==rest) {
|
||
|
return length; // Reached the end of the string.
|
||
|
}
|
||
|
pos+=i;
|
||
|
rest-=i;
|
||
|
|
||
|
// Check whether the current code point is in the original set,
|
||
|
// without the string starts and ends.
|
||
|
int32_t cpLength=spanOne(spanSet, s+pos, rest);
|
||
|
if(cpLength>0) {
|
||
|
return pos; // There is a set element at pos.
|
||
|
}
|
||
|
|
||
|
// Try to match the strings at pos.
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
if(spanLengths[i]==ALL_CP_CONTAINED) {
|
||
|
continue; // Irrelevant string. (Also the empty string.)
|
||
|
}
|
||
|
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
|
||
|
const char16_t *s16=string.getBuffer();
|
||
|
int32_t length16=string.length();
|
||
|
U_ASSERT(length>0);
|
||
|
if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
|
||
|
return pos; // There is a set element at pos.
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// The span(while not contained) ended on a string start/end which is
|
||
|
// not in the original set. Skip this code point and continue.
|
||
|
// cpLength<0
|
||
|
pos-=cpLength;
|
||
|
rest+=cpLength;
|
||
|
} while(rest!=0);
|
||
|
return length; // Reached the end of the string.
|
||
|
}
|
||
|
|
||
|
int32_t UnicodeSetStringSpan::spanNotBack(const char16_t *s, int32_t length) const {
|
||
|
int32_t pos=length;
|
||
|
int32_t i, stringsLength=strings.size();
|
||
|
do {
|
||
|
// Span until we find a code point from the set,
|
||
|
// or a code point that starts or ends some string.
|
||
|
pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
|
||
|
if(pos==0) {
|
||
|
return 0; // Reached the start of the string.
|
||
|
}
|
||
|
|
||
|
// Check whether the current code point is in the original set,
|
||
|
// without the string starts and ends.
|
||
|
int32_t cpLength=spanOneBack(spanSet, s, pos);
|
||
|
if(cpLength>0) {
|
||
|
return pos; // There is a set element at pos.
|
||
|
}
|
||
|
|
||
|
// Try to match the strings at pos.
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
// Use spanLengths rather than a spanBackLengths pointer because
|
||
|
// it is easier and we only need to know whether the string is irrelevant
|
||
|
// which is the same in either array.
|
||
|
if(spanLengths[i]==ALL_CP_CONTAINED) {
|
||
|
continue; // Irrelevant string. (Also the empty string.)
|
||
|
}
|
||
|
const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
|
||
|
const char16_t *s16=string.getBuffer();
|
||
|
int32_t length16=string.length();
|
||
|
U_ASSERT(length>0);
|
||
|
if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
|
||
|
return pos; // There is a set element at pos.
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// The span(while not contained) ended on a string start/end which is
|
||
|
// not in the original set. Skip this code point and continue.
|
||
|
// cpLength<0
|
||
|
pos+=cpLength;
|
||
|
} while(pos!=0);
|
||
|
return 0; // Reached the start of the string.
|
||
|
}
|
||
|
|
||
|
int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
|
||
|
int32_t pos=0, rest=length;
|
||
|
int32_t i, stringsLength=strings.size();
|
||
|
uint8_t *spanUTF8Lengths=spanLengths;
|
||
|
if(all) {
|
||
|
spanUTF8Lengths+=2*stringsLength;
|
||
|
}
|
||
|
do {
|
||
|
// Span until we find a code point from the set,
|
||
|
// or a code point that starts or ends some string.
|
||
|
i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
|
||
|
if(i==rest) {
|
||
|
return length; // Reached the end of the string.
|
||
|
}
|
||
|
pos+=i;
|
||
|
rest-=i;
|
||
|
|
||
|
// Check whether the current code point is in the original set,
|
||
|
// without the string starts and ends.
|
||
|
int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
|
||
|
if(cpLength>0) {
|
||
|
return pos; // There is a set element at pos.
|
||
|
}
|
||
|
|
||
|
// Try to match the strings at pos.
|
||
|
const uint8_t *s8=utf8;
|
||
|
int32_t length8;
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
length8=utf8Lengths[i];
|
||
|
// ALL_CP_CONTAINED: Irrelevant string.
|
||
|
if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
|
||
|
return pos; // There is a set element at pos.
|
||
|
}
|
||
|
s8+=length8;
|
||
|
}
|
||
|
|
||
|
// The span(while not contained) ended on a string start/end which is
|
||
|
// not in the original set. Skip this code point and continue.
|
||
|
// cpLength<0
|
||
|
pos-=cpLength;
|
||
|
rest+=cpLength;
|
||
|
} while(rest!=0);
|
||
|
return length; // Reached the end of the string.
|
||
|
}
|
||
|
|
||
|
int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
|
||
|
int32_t pos=length;
|
||
|
int32_t i, stringsLength=strings.size();
|
||
|
uint8_t *spanBackUTF8Lengths=spanLengths;
|
||
|
if(all) {
|
||
|
spanBackUTF8Lengths+=3*stringsLength;
|
||
|
}
|
||
|
do {
|
||
|
// Span until we find a code point from the set,
|
||
|
// or a code point that starts or ends some string.
|
||
|
pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
|
||
|
if(pos==0) {
|
||
|
return 0; // Reached the start of the string.
|
||
|
}
|
||
|
|
||
|
// Check whether the current code point is in the original set,
|
||
|
// without the string starts and ends.
|
||
|
int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
|
||
|
if(cpLength>0) {
|
||
|
return pos; // There is a set element at pos.
|
||
|
}
|
||
|
|
||
|
// Try to match the strings at pos.
|
||
|
const uint8_t *s8=utf8;
|
||
|
int32_t length8;
|
||
|
for(i=0; i<stringsLength; ++i) {
|
||
|
length8=utf8Lengths[i];
|
||
|
// ALL_CP_CONTAINED: Irrelevant string.
|
||
|
if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
|
||
|
return pos; // There is a set element at pos.
|
||
|
}
|
||
|
s8+=length8;
|
||
|
}
|
||
|
|
||
|
// The span(while not contained) ended on a string start/end which is
|
||
|
// not in the original set. Skip this code point and continue.
|
||
|
// cpLength<0
|
||
|
pos+=cpLength;
|
||
|
} while(pos!=0);
|
||
|
return 0; // Reached the start of the string.
|
||
|
}
|
||
|
|
||
|
U_NAMESPACE_END
|