Java源码示例:com.ibm.icu.text.UnicodeSet
示例1
/**
 * Adds to {@code set} the start code point of every same-value range of the trie.
 * Iteration stops as soon as a range flagged {@code leadSurrogate} is returned.
 *
 * @param set receives the range start code points
 */
public final void addPropertyStarts(UnicodeSet set) {
    final Iterator<Trie2.Range> ranges = trie.iterator();
    while (ranges.hasNext()) {
        final Trie2.Range current = ranges.next();
        if (current.leadSurrogate) {
            break;
        }
        set.add(current.startCodePoint);
    }
    // No code points with hardcoded properties are added right now:
    // code points with hardcoded special-casing properties are omitted
    // because no property UnicodeSets are built for them at present.
}
示例2
/**
 * Adds the start of a same-norm16-value range to {@code set} and, for a
 * multi-code-point range whose value is an algorithmic no-no, also adds every
 * code point at which the FCD16 value changes.
 *
 * @param start first code point of the range
 * @param end last code point of the range (inclusive)
 * @param value the norm16 value shared by the range
 * @param set receives the property start code points
 */
private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
    set.add(start);
    if (start == end || !isAlgorithmicNoNo(value)) {
        return;
    }
    // The code points share one norm16 value with algorithmic decompositions,
    // but they might still have different non-zero FCD16 values.
    int lastFcd16 = getFCD16(start);
    for (int cp = start + 1; cp <= end; ++cp) {
        final int currentFcd16 = getFCD16(cp);
        if (currentFcd16 != lastFcd16) {
            set.add(cp);
            lastFcd16 = currentFcd16;
        }
    }
}
示例3
/**
 * Reports whether any characters have a decomposition that starts with c,
 * and if so replaces the contents of the set with those characters.
 * <b>{@link #ensureCanonIterData()} must have been called before this method,
 * or else this method will crash.</b>
 * @param c A Unicode code point.
 * @param set A UnicodeSet that is cleared and then filled with the characters
 *            whose decompositions start with c, if there are any.
 * @return true if there are characters whose decomposition starts with c;
 *         false otherwise, in which case the set is left untouched.
 */
public boolean getCanonStartSet(int c, UnicodeSet set) {
    final int data = canonIterData.get(c) & ~CANON_NOT_SEGMENT_STARTER;
    if (data == 0) {
        return false;
    }
    set.clear();

    // The masked value is either an index into canonStartSets or a single code point.
    final int payload = data & CANON_VALUE_MASK;
    final boolean payloadIsSetIndex = (data & CANON_HAS_SET) != 0;
    if (payloadIsSetIndex) {
        set.addAll(canonStartSets.get(payload));
    } else if (payload != 0) {
        set.add(payload);
    }

    if ((data & CANON_HAS_COMPOSITIONS) == 0) {
        return true;
    }
    final int norm16 = getNorm16(c);
    if (norm16 != JAMO_L) {
        addComposites(getCompositionsList(norm16), set);
        return true;
    }
    // Jamo L: add the whole block of Hangul syllables that begin with it.
    final int firstSyllable =
            Hangul.HANGUL_BASE + (c - Hangul.JAMO_L_BASE) * Hangul.JAMO_VT_COUNT;
    set.add(firstSyllable, firstSyllable + Hangul.JAMO_VT_COUNT - 1);
    return true;
}
示例4
/**
 * Walks a compositions list stored in {@code maybeYesCompositions} and adds
 * every composite found to the set, recursing into composites that are
 * flagged as having compositions of their own.
 *
 * @param list some character's compositions list (index of its first unit)
 * @param set recursively receives the composites from these compositions
 */
private void addComposites(int list, UnicodeSet set) {
    int pos = list;
    boolean lastTuple;
    do {
        final int head = maybeYesCompositions.charAt(pos);
        final int packed;
        if ((head & COMP_1_TRIPLE) != 0) {
            // Three-unit entry: the composite spans the second and third units.
            final int upper = maybeYesCompositions.charAt(pos + 1) & ~COMP_2_TRAIL_MASK;
            packed = (upper << 16) | maybeYesCompositions.charAt(pos + 2);
            pos += 3;
        } else {
            // Two-unit entry: the composite is in the second unit.
            packed = maybeYesCompositions.charAt(pos + 1);
            pos += 2;
        }
        final int composite = packed >> 1;
        final boolean combinesForward = (packed & 1) != 0;
        if (combinesForward) {
            addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
        }
        set.add(composite);
        lastTuple = (head & COMP_1_LAST_TUPLE) != 0;
    } while (!lastTuple);
}
示例5
/**
 * Removes context-sensitive mappings for the code points of {@code set}.
 * Iteration ends at the first string element of the set. Does nothing,
 * and does not mark the builder as modified, when the set is empty.
 *
 * @param set the code points whose contextual mappings are to be suppressed
 */
void suppressContractions(UnicodeSet set) {
    if (set.isEmpty()) {
        return;
    }
    final UnicodeSetIterator cursor = new UnicodeSetIterator(set);
    for (;;) {
        if (!cursor.next() || cursor.codepoint == UnicodeSetIterator.IS_STRING) {
            break;
        }
        final int c = cursor.codepoint;
        final int current = trie.get(c);
        if (current == Collation.FALLBACK_CE32) {
            final int fromBase = base.getFinalCE32(base.getCE32(c));
            if (Collation.ce32HasContext(fromBase)) {
                final int copied = copyFromBaseCE32(c, fromBase, false /* without context */);
                trie.set(c, copied);
            }
        } else if (isBuilderContextCE32(current)) {
            // Keep only the CE32 and simply abandon the list of ConditionalCE32.
            // The caller will copy this builder in the end,
            // eliminating unreachable data.
            final int plain = getConditionalCE32ForCE32(current).ce32;
            trie.set(c, plain);
            contextChars.remove(c);
        }
    }
    modified = true;
}
示例6
/**
 * Re-tags the mapping of every {@code [:Nd:]} code point that has its own
 * CE32 (neither fallback nor unassigned) with a digit-tag CE32 that carries
 * the index of the stored original CE32 and the digit value.
 *
 * @throws IndexOutOfBoundsException if the stored CE32 index exceeds
 *         {@code Collation.MAX_INDEX}
 */
protected void setDigitTags() {
    final UnicodeSetIterator iter = new UnicodeSetIterator(new UnicodeSet("[:Nd:]"));
    while (iter.next()) {
        assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
        final int c = iter.codepoint;
        final int original = trie.get(c);
        if (original == Collation.FALLBACK_CE32 || original == Collation.UNASSIGNED_CE32) {
            continue;
        }
        final int index = addCE32(original);
        if (index > Collation.MAX_INDEX) {
            // BufferOverflowException is a better fit
            // but cannot be constructed with a message string.
            throw new IndexOutOfBoundsException("too many mappings");
        }
        final int digitValue = UCharacter.digit(c); // u_charDigitValue(c)
        final int tagged = Collation.makeCE32FromTagIndexAndLength(
                Collation.DIGIT_TAG, index, digitValue);
        trie.set(c, tagged);
    }
}
示例7
/**
 * Builds an ICUTransformFilter that applies a Transliterator to the text of a stream.
 * <p>
 * When the transliterator is rule-based and has no filter yet, its non-empty
 * source set is installed as its filter.
 *
 * @param input {@link TokenStream} to filter.
 * @param transform Transliterator to transform the text.
 */
@SuppressWarnings("deprecation")
public ICUTransformFilter(TokenStream input, Transliterator transform) {
    super(input);
    this.transform = transform;

    /*
     * This is cheating, but speeds things up a lot.
     * If we wanted to use pkg-private APIs we could probably do better.
     */
    final boolean canInstallFilter = transform.getFilter() == null
            && transform instanceof com.ibm.icu.text.RuleBasedTransliterator;
    if (!canInstallFilter) {
        return;
    }
    final UnicodeSet sources = transform.getSourceSet();
    if (sources == null || sources.isEmpty()) {
        return;
    }
    transform.setFilter(sources);
}
示例8
/**
 * Creates a new ICUFoldingFilterFactory.
 * <p>
 * An optional {@code filter} argument is parsed as a UnicodeSet pattern; when
 * the resulting set is non-empty, the folding normalizer is wrapped in a
 * {@link FilteredNormalizer2} using that (frozen) set.
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if arguments remain after {@code filter} has been read
 */
public ICUFoldingFilterFactory(Map<String,String> args) {
    super(args);
    Normalizer2 selected = ICUFoldingFilter.NORMALIZER;
    final String pattern = get(args, "filter");
    if (pattern != null) {
        final UnicodeSet filterSet = new UnicodeSet(pattern);
        if (!filterSet.isEmpty()) {
            filterSet.freeze();
            selected = new FilteredNormalizer2(selected, filterSet);
        }
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
    this.normalizer = selected;
}
示例9
/**
 * Creates a new ICUNormalizer2CharFilterFactory.
 * <p>
 * Reads {@code form} (default {@code nfkc_cf}), {@code mode} ({@code compose}
 * or {@code decompose}, default {@code compose}) and an optional {@code filter}
 * UnicodeSet pattern. A non-empty filter set is frozen and used to wrap the
 * normalizer in a {@link FilteredNormalizer2}.
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if arguments remain after the known ones have been read
 */
public ICUNormalizer2CharFilterFactory(Map<String,String> args) {
    super(args);
    final String form = get(args, "form", "nfkc_cf");
    final String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");

    final Normalizer2.Mode normalizerMode;
    if ("compose".equals(mode)) {
        normalizerMode = Normalizer2.Mode.COMPOSE;
    } else {
        normalizerMode = Normalizer2.Mode.DECOMPOSE;
    }
    Normalizer2 selected = Normalizer2.getInstance(null, form, normalizerMode);

    final String pattern = get(args, "filter");
    if (pattern != null) {
        final UnicodeSet filterSet = new UnicodeSet(pattern);
        if (!filterSet.isEmpty()) {
            filterSet.freeze();
            selected = new FilteredNormalizer2(selected, filterSet);
        }
    }
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
    this.normalizer = selected;
}
示例10
/**
 * Walks the ranges of {@code normTrie} and adds to {@code set} each range that
 * either has a norm16 value above {@code MIN_NORMAL_MAYBE_YES} (other than
 * {@code JAMO_VT}), or has a norm16 value in
 * {@code [minNoNoCompNoMaybeCC, limitNoNo)} and whose first code point has an
 * FCD16 value above 0xff.
 *
 * @param set receives the selected code point ranges
 */
public void addLcccChars(UnicodeSet set) {
    final CodePointMap.Range range = new CodePointMap.Range();
    for (int start = 0;
            normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT,
                    null, range); ) {
        final int end = range.getEnd();
        final int norm16 = range.getValue();

        final boolean selected;
        if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) {
            selected = true;
        } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
            // Only the first code point of the range is inspected.
            selected = getFCD16(start) > 0xff;
        } else {
            selected = false;
        }
        if (selected) {
            set.add(start, end);
        }
        start = end + 1;
    }
}
示例11
/**
 * Tells whether some characters decompose into a sequence starting with c.
 * When that is the case, the set is emptied and refilled with those characters.
 * <b>{@link #ensureCanonIterData()} must have been called before this method,
 * or else this method will crash.</b>
 * @param c A Unicode code point.
 * @param set A UnicodeSet to receive the characters whose decompositions
 *            start with c, if there are any.
 * @return true if there are characters whose decomposition starts with c.
 */
public boolean getCanonStartSet(int c, UnicodeSet set) {
    final int flagsAndValue = canonIterData.get(c) & ~CANON_NOT_SEGMENT_STARTER;
    if (flagsAndValue == 0) {
        return false;
    }
    set.clear();

    final int masked = flagsAndValue & CANON_VALUE_MASK;
    if ((flagsAndValue & CANON_HAS_SET) == 0) {
        // The value, if any, is a single code point.
        if (masked != 0) {
            set.add(masked);
        }
    } else {
        // The value is an index into the list of start sets.
        set.addAll(canonStartSets.get(masked));
    }

    final boolean hasCompositions = (flagsAndValue & CANON_HAS_COMPOSITIONS) != 0;
    if (hasCompositions) {
        final int rawNorm16 = getRawNorm16(c);
        if (rawNorm16 != JAMO_L) {
            addComposites(getCompositionsList(rawNorm16), set);
        } else {
            // Jamo L: add the whole block of Hangul syllables that begin with it.
            final int first =
                    Hangul.HANGUL_BASE + (c - Hangul.JAMO_L_BASE) * Hangul.JAMO_VT_COUNT;
            set.add(first, first + Hangul.JAMO_VT_COUNT - 1);
        }
    }
    return true;
}
示例12
/**
 * Adds to the set every composite listed in a compositions list of
 * {@code maybeYesCompositions}, descending recursively into the list of any
 * composite whose entry has its low bit set.
 *
 * @param list some character's compositions list (index of its first unit)
 * @param set recursively receives the composites from these compositions
 */
private void addComposites(int list, UnicodeSet set) {
    int index = list;
    for (;;) {
        final int first = maybeYesCompositions.charAt(index);
        final boolean triple = (first & COMP_1_TRIPLE) != 0;
        final int compositeAndFlag;
        if (!triple) {
            compositeAndFlag = maybeYesCompositions.charAt(index + 1);
            index += 2;
        } else {
            final int second = maybeYesCompositions.charAt(index + 1);
            final int third = maybeYesCompositions.charAt(index + 2);
            compositeAndFlag = ((second & ~COMP_2_TRAIL_MASK) << 16) | third;
            index += 3;
        }
        final int composite = compositeAndFlag >> 1;
        if ((compositeAndFlag & 1) != 0) {
            // The composite has a compositions list of its own.
            addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
        }
        set.add(composite);
        if ((first & COMP_1_LAST_TUPLE) != 0) {
            return;
        }
    }
}
示例13
/**
 * dovec - creates arcs between two states for every range of a set.
 * All kinds of MCCE complexity removed.
 *
 * @param set the ranges to add arcs for
 * @param lp state the arcs start from
 * @param rp state the arcs lead to
 * @throws RegexException propagated from the color map / NFA operations
 */
private void dovec(UnicodeSet set, State lp, State rp) throws RegexException {
    final int total = set.getRangeCount();
    int rx = 0;
    while (rx < total) {
        final int lo = set.getRangeStart(rx);
        final int hi = set.getRangeEnd(rx);
        /*
         * Note: ICU operates in UTF-32 here, and the ColorMap is happy to play along.
         */
        if (LOG.isDebugEnabled()) {
            if (IS_DEBUG) {
                LOG.debug(String.format("%s %d %4x %4x", set, rx, lo, hi));
            }
        }
        //TODO: this arc is probably redundant.
        final boolean singleCodePoint = (lo == hi);
        if (singleCodePoint) {
            nfa.newarc(PLAIN, cm.subcolor(lo), lp, rp);
        }
        cm.subrange(lo, hi, lp, rp);
        rx++;
    }
}
示例14
/**
 * {@icu} Constructs a string tokenizer for the specified string, using the
 * characters of the delim argument as the token separators.
 * <p>With returnDelims false, delimiter characters are skipped and act
 * purely as separators between tokens.
 * <p>With returnDelims true, delimiter characters are returned as tokens
 * as well. If coalescedelims is true, each run of delimiter characters
 * yields a single token, otherwise each delimiter yields its own token.
 * Since surrogate pairs can be delimiters, the returned token might be
 * two chars in length.
 * @param str a string to be parsed.
 * @param delim the delimiters; null selects the empty delimiter set.
 * @param returndelims flag indicating whether to return the delimiters
 *        as tokens.
 * @param coalescedelims flag indicating whether to return a run of
 *        delimiters as a single token or as one token per delimiter.
 *        This only takes effect if returndelims is true.
 * @exception NullPointerException if str is null
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public StringTokenizer(String str, UnicodeSet delim, boolean returndelims, boolean coalescedelims)
{
    m_source_ = str;
    m_length_ = str.length();
    m_delimiters_ = (delim != null) ? delim : EMPTY_DELIMITER_;
    m_returnDelimiters_ = returndelims;
    m_coalesceDelimiters_ = coalescedelims;
    m_tokenOffset_ = -1;
    m_tokenSize_ = -1;

    if (m_length_ == 0) {
        // string length 0, no tokens
        m_nextOffset_ = -1;
        return;
    }
    m_nextOffset_ = 0;
    if (returndelims) {
        return;
    }
    // Delimiters are not returned, so start at the first non-delimiter.
    m_nextOffset_ = getNextNonDelimiter(0);
}
示例15
/**
 * Returns the set of exemplar characters for a locale.
 *
 * @param options   Bitmask for options to apply to the exemplar pattern.
 *                  Specify zero to retrieve the exemplar set as it is
 *                  defined in the locale data.  Specify
 *                  UnicodeSet.CASE to retrieve a case-folded exemplar
 *                  set.  See {@link UnicodeSet#applyPattern(String,
 *                  int)} for a complete list of valid options.  The
 *                  IGNORE_SPACE bit is always set, regardless of the
 *                  value of 'options'.
 * @param extype    The type of exemplar set to be retrieved,
 *                  ES_STANDARD, ES_INDEX, ES_AUXILIARY, or ES_PUNCTUATION
 * @return          The set of exemplar characters for the given locale.
 *                  If there is nothing available for the locale,
 *                  then null is returned if {@link #getNoSubstitute()} is true, otherwise the
 *                  root value is returned (which may be UnicodeSet.EMPTY).
 * @exception       RuntimeException if the extype is invalid.
 * @stable ICU 3.4
 */
public UnicodeSet getExemplarSet(int options, int extype) {
    if (extype == ES_CURRENCY) {
        // currency symbol exemplar is no longer available
        if (noSubstitute) {
            return null;
        }
        return UnicodeSet.EMPTY;
    }

    // Resource keys, indexed by exemplar type.
    final String[] keysByType = {
        "ExemplarCharacters",
        "AuxExemplarCharacters",
        "ExemplarCharactersIndex",
        "ExemplarCharactersCurrency",
        "ExemplarCharactersPunctuation"
    };

    try {
        // An invalid extype makes this lookup throw an out-of-bounds exception.
        final String key = keysByType[extype];
        final ICUResourceBundle resource = (ICUResourceBundle) bundle.get(key);
        final boolean inheritedFromRoot = !bundle.isRoot() && resource.isRoot();
        if (noSubstitute && inheritedFromRoot) {
            return null;
        }
        final int patternOptions = UnicodeSet.IGNORE_SPACE | options;
        return new UnicodeSet(resource.getString(), patternOptions);
    } catch (ArrayIndexOutOfBoundsException aiooe) {
        throw new IllegalArgumentException(aiooe);
    } catch (Exception ex) {
        // Any other failure means there is no exemplar data to return.
        if (noSubstitute) {
            return null;
        }
        return UnicodeSet.EMPTY;
    }
}
示例16
/**
 * Parses a UnicodeSet pattern embedded in {@code regex} at index {@code i}
 * and appends its {@code toPattern(false)} form to {@code result}.
 *
 * @param regex the text containing the set pattern
 * @param i index in {@code regex} at which the set pattern starts
 * @param result receives the re-generated pattern
 * @param temp scratch set; it is cleared and overwritten
 * @param pos scratch parse position; it is overwritten
 * @return the index of the last character consumed (one less than the parse
 *         position, to allow for the caller's loop increment)
 * @throws IllegalArgumentException wrapping any exception raised while parsing
 */
private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
    try {
        pos.setIndex(i);
        temp.clear();
        final UnicodeSet parsed = temp.applyPattern(regex, pos, symbolTable, 0);
        // hack to fix toPattern: complement twice
        parsed.complement();
        parsed.complement();
        final String pattern = parsed.toPattern(false);
        result.append(pattern);
        // allow for the loop increment
        return pos.getIndex() - 1;
    } catch (Exception e) {
        throw new IllegalArgumentException("Error in " + regex, e);
    }
}
示例17
/**
 * Does the set contain the next code point?
 * If so, return its length; otherwise return its negative length.
 *
 * @param set the set to test against
 * @param s the text
 * @param start index of the first code unit of the code point
 * @param length number of code units available from {@code start}
 * @return 1 or 2 if the code point is contained, -1 or -2 if it is not
 */
static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
    final char first = s.charAt(start);
    final boolean leadWithRoom = first >= 0xd800 && first <= 0xdbff && length >= 2;
    if (leadWithRoom) {
        final char second = s.charAt(start + 1);
        if (com.ibm.icu.text.UTF16.isTrailSurrogate(second)) {
            final int supplementary = Character.toCodePoint(first, second);
            if (set.contains(supplementary)) {
                return 2;
            }
            return -2;
        }
    }
    // BMP code point or unpaired surrogate.
    if (set.contains(first)) {
        return 1;
    }
    return -1;
}
示例18
/**
 * Does the set contain the code point that ends at {@code length}?
 * If so, return its length; otherwise return its negative length.
 *
 * @param set the set to test against
 * @param s the text
 * @param length index just past the code point to test
 * @return 1 or 2 if the code point is contained, -1 or -2 if it is not
 */
static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
    final char last = s.charAt(length - 1);
    final boolean trailWithRoom = last >= 0xdc00 && last <= 0xdfff && length >= 2;
    if (trailWithRoom) {
        final char previous = s.charAt(length - 2);
        if (com.ibm.icu.text.UTF16.isLeadSurrogate(previous)) {
            final int supplementary = Character.toCodePoint(previous, last);
            if (set.contains(supplementary)) {
                return 2;
            }
            return -2;
        }
    }
    // BMP code point or unpaired surrogate.
    if (set.contains(last)) {
        return 1;
    }
    return -1;
}
示例19
/**
 * Passes every same-value range of {@code normTrie} to {@code enumLcccRange},
 * stopping at the first range flagged {@code leadSurrogate}.
 *
 * @param set forwarded to {@code enumLcccRange} for each range
 */
public void addLcccChars(UnicodeSet set) {
    for (Iterator<Trie2.Range> ranges = normTrie.iterator(); ranges.hasNext(); ) {
        final Trie2.Range current = ranges.next();
        if (current.leadSurrogate) {
            return;
        }
        enumLcccRange(current.startCodePoint, current.endCodePoint, current.value, set);
    }
}
示例20
/**
 * eclass - Because we have no MCCE support, this
 * just processes single characters.
 *
 * @param c the character
 * @param cases if true, the result of {@code allcases(c)} is returned instead
 *        of a set holding only {@code c}
 * @return the set for the character
 */
static UnicodeSet eclass(char c, boolean cases) {
    if (!cases) {
        /* otherwise, none: just the character itself */
        final UnicodeSet single = new UnicodeSet();
        single.add(c);
        return single;
    }
    return allcases(c);
}
示例21
/**
 * Adds to {@code set} the start code point of each same-value range of the
 * canonical iterator data trie, as seen through {@code segmentStarterMapper}.
 * Iteration stops at the first range flagged {@code leadSurrogate}.
 *
 * @param set receives the range start code points
 */
public void addCanonIterPropertyStarts(UnicodeSet set) {
    ensureCanonIterData();
    // currently only used for the SEGMENT_STARTER property
    final Iterator<Trie2.Range> ranges = canonIterData.iterator(segmentStarterMapper);
    boolean more = ranges.hasNext();
    while (more) {
        final Trie2.Range current = ranges.next();
        if (current.leadSurrogate) {
            break;
        }
        set.add(current.startCodePoint);
        more = ranges.hasNext();
    }
}
示例22
/**
 * Collects a UnicodeSet pattern between a balanced pair of [brackets] in
 * {@code rules}, applies it to {@code set}, and then expects an
 * option-terminating ']' after optional white space.
 *
 * @param i index in {@code rules} where the set pattern starts
 * @param set receives the parsed pattern
 * @return the index after the option-terminating ']', or the index at which
 *         a parse error was reported
 * @throws ParseException as declared; parse errors are reported via {@code setParseError}
 */
private int parseUnicodeSet(int i, UnicodeSet set) throws ParseException {
    int depth = 0;
    int pos = i;
    boolean balanced = false;
    while (!balanced) {
        if (pos == rules.length()) {
            setParseError("unbalanced UnicodeSet pattern brackets");
            return pos;
        }
        switch (rules.charAt(pos++)) {
        case '[':
            ++depth;
            break;
        case ']':
            balanced = (--depth == 0);
            break;
        default:
            break;
        }
    }

    final String pattern = rules.substring(i, pos);
    try {
        set.applyPattern(pattern);
    } catch (Exception e) {
        setParseError("not a valid UnicodeSet pattern: " + e.getMessage());
    }

    pos = skipWhiteSpace(pos);
    final boolean terminated = pos != rules.length() && rules.charAt(pos) == ']';
    if (!terminated) {
        setParseError("missing option-terminating ']' after UnicodeSet pattern");
        return pos;
    }
    return pos + 1;
}
示例23
/**
 * For each code point of {@code set} that currently falls back to the base
 * data, copies the base mapping into this builder's trie. Iteration ends at
 * the first string element of the set. Does nothing, and does not mark the
 * builder as modified, when the set is empty.
 *
 * @param set the code points to optimize
 */
void optimize(UnicodeSet set) {
    if (set.isEmpty()) {
        return;
    }
    final UnicodeSetIterator cursor = new UnicodeSetIterator(set);
    for (;;) {
        if (!cursor.next() || cursor.codepoint == UnicodeSetIterator.IS_STRING) {
            break;
        }
        final int c = cursor.codepoint;
        if (trie.get(c) != Collation.FALLBACK_CE32) {
            continue;
        }
        final int fromBase = base.getFinalCE32(base.getCE32(c));
        final int copied = copyFromBaseCE32(c, fromBase, true);
        trie.set(c, copied);
    }
    modified = true;
}
示例24
/**
 * Adds to {@code set} the start code point of each same-value range of the
 * properties vectors trie, stopping at the first range flagged
 * {@code leadSurrogate}.
 *
 * @param set receives the range start code points
 */
public void upropsvec_addPropertyStarts(UnicodeSet set) {
    if (m_additionalColumnsCount_ <= 0) {
        /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
        return;
    }
    final Iterator<Trie2.Range> ranges = m_additionalTrie_.iterator();
    while (ranges.hasNext()) {
        final Trie2.Range current = ranges.next();
        if (current.leadSurrogate) {
            break;
        }
        set.add(current.startCodePoint);
    }
}
示例25
/**
 * Handles one range of code points that share a CE32, honoring the
 * tailoring mode in {@code cne.checkTailored}:
 * <ul>
 * <li>{@code == 0}: no tailoring; the range is handled as is.</li>
 * <li>{@code < 0}: the range is recorded in {@code cne.tailored} and handled,
 *     unless its CE32 is the fallback CE32, in which case it is skipped.</li>
 * <li>{@code > 0}: tailored code points are excluded; only the untailored
 *     part of the range is handled.</li>
 * </ul>
 *
 * @param start first code point of the range
 * @param end last code point of the range (inclusive)
 * @param ce32 the CE32 shared by the range
 * @param cne the collector whose {@code handleCE32} receives the ranges
 */
private void enumCnERange(int start, int end, int ce32, ContractionsAndExpansions cne) {
    if (cne.checkTailored == 0) {
        // There is no tailoring.
        // No need to collect nor check the tailored set.
    } else if (cne.checkTailored < 0) {
        // Collect the set of code points with mappings in the tailoring data.
        if (ce32 == Collation.FALLBACK_CE32) {
            return; // fallback to base, not tailored
        } else {
            cne.tailored.add(start, end);
        }
        // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
    } else if (start == end) {
        if (cne.tailored.contains(start)) {
            return;
        }
    } else if (cne.tailored.containsSome(start, end)) {
        if (cne.ranges == null) {
            cne.ranges = new UnicodeSet();
        }
        cne.ranges.set(start, end).removeAll(cne.tailored);
        int count = cne.ranges.getRangeCount();
        for (int i = 0; i < count; ++i) {
            cne.handleCE32(cne.ranges.getRangeStart(i), cne.ranges.getRangeEnd(i), ce32);
        }
        // The untailored sub-ranges have just been handled. Falling through
        // would handle the whole range again, including the tailored code
        // points that must be excluded.
        return;
    }
    cne.handleCE32(start, end, ce32);
}
示例26
/**
 * Adds to {@code set} one string per code point, each built from
 * {@code unreversedPrefix}, the code point, and {@code suffix} when it is not
 * null. The code point {@code start} is always added, even if it is greater
 * than {@code end}.
 *
 * @param start first code point
 * @param end last code point (inclusive)
 * @param set receives the strings; nothing happens if it is null
 */
void addStrings(int start, int end, UnicodeSet set) {
    if (set == null) {
        return;
    }
    final StringBuilder buffer = new StringBuilder(unreversedPrefix);
    // Right after construction the buffer holds exactly the prefix.
    final int prefixLength = buffer.length();
    int cp = start;
    do {
        buffer.appendCodePoint(cp);
        if (suffix != null) {
            buffer.append(suffix);
        }
        set.add(buffer);
        // Truncate back to the prefix for the next code point.
        buffer.setLength(prefixLength);
        ++cp;
    } while (cp <= end);
}
示例27
/**
 * Collects into {@code set} the first code point of every same-value range
 * of the properties vectors trie. Ranges are read until one flagged
 * {@code leadSurrogate} is encountered.
 *
 * @param set receives the range start code points
 */
public void upropsvec_addPropertyStarts(UnicodeSet set) {
    /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
    final boolean hasVectorsTrie = m_additionalColumnsCount_ > 0;
    if (hasVectorsTrie) {
        for (Iterator<Trie2.Range> it = m_additionalTrie_.iterator(); it.hasNext(); ) {
            final Trie2.Range next = it.next();
            if (next.leadSurrogate) {
                break;
            }
            set.add(next.startCodePoint);
        }
    }
}
示例28
/**
 * Expands one rule whose left-hand side is a UnicodeSet pattern into
 * hexadecimal {@code source>target} lines appended to {@code builder}.
 * <p>
 * If the right-hand side matches {@code NUMERIC_VALUE_PATTERN}, one line is
 * written per code point, mapping it to {@code 0x30} plus its numeric value
 * and followed by a comment with the character name. Otherwise one line is
 * written per range, mapping it to the right-hand side verbatim. A string
 * element in the set is reported on stderr and terminates the JVM.
 *
 * @param builder receives the expanded rule lines
 * @param leftHandSide UnicodeSet pattern (parsed with IGNORE_SPACE)
 * @param rightHandSide the mapping target
 */
private static void expandSingleRule
        (StringBuilder builder, String leftHandSide, String rightHandSide)
        throws IllegalArgumentException {
    final UnicodeSet sources = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
    final boolean numericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
    final UnicodeSetIterator it = new UnicodeSetIterator(sources);
    while (it.nextRange()) {
        if (it.codepoint == UnicodeSetIterator.IS_STRING) {
            System.err.println("ERROR: String '" + it.getString() + "' found in UnicodeSet");
            System.exit(1);
        } else if (numericValue) {
            int cp = it.codepoint;
            while (cp <= it.codepointEnd) {
                final String sourceHex = String.format(Locale.ROOT, "%04X", cp);
                final String targetHex =
                        String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp));
                builder.append(sourceHex).append('>');
                builder.append(targetHex);
                builder.append(" # ").append(UCharacter.getName(cp));
                builder.append("\n");
                ++cp;
            }
        } else {
            builder.append(String.format(Locale.ROOT, "%04X", it.codepoint));
            final boolean multiCodePointRange = it.codepointEnd > it.codepoint;
            if (multiCodePointRange) {
                final String endHex = String.format(Locale.ROOT, "%04X", it.codepointEnd);
                builder.append("..").append(endHex);
            }
            builder.append('>').append(rightHandSide).append("\n");
        }
    }
}
示例29
/**
 * Checks that an unassigned Extended_Pictographic code point is analyzed as
 * an emoji token, both on its own and when two of them are joined by U+200D.
 */
public void testEmojiFromTheFuture() throws Exception {
    // pick an unassigned character with extended_pictographic
    final UnicodeSet candidates = new UnicodeSet("[[:Extended_Pictographic:]&[:Unassigned:]]");
    final int ch = candidates.getRangeStart(0);
    final String value = new String(Character.toChars(ch));
    final String[] emojiType = new String[] { "<EMOJI>" };

    // should analyze to emoji type
    BaseTokenStreamTestCase.assertAnalyzesTo(a, value, new String[] { value }, emojiType);

    // shouldn't break in a sequence
    final String joined = value + '\u200D' + value;
    BaseTokenStreamTestCase.assertAnalyzesTo(a, joined, new String[] { joined }, emojiType);
}
示例30
/**
 * Checks that constructing an ICUTransformFilter installs the source set
 * {@code [ab]} as the filter of a rule-based transliterator that had none.
 */
public void testOptimizer() throws Exception {
    // convert a's to b's and b's to c's
    final Transliterator custom =
            Transliterator.createFromRules("test", "a > b; b > c;", Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);

    final KeywordTokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(""));
    // The constructor is invoked only for its effect on the transliterator.
    new ICUTransformFilter(tokenizer, custom);

    final UnicodeSet expected = new UnicodeSet("[ab]");
    assertTrue(custom.getFilter().equals(expected));
}