mirror of
https://github.com/openjdk/jdk.git
synced 2026-03-02 04:00:16 +00:00
8160302: Reduce number of lambdas created when loading java.util.regex.Pattern
Reviewed-by: sherman, martin
This commit is contained in:
parent
4d92a188b2
commit
00a5fb9183
@ -32,164 +32,195 @@ import java.util.regex.Pattern.BmpCharPredicate;
|
||||
|
||||
class CharPredicates {
|
||||
|
||||
static final CharPredicate ALPHABETIC = Character::isAlphabetic;
|
||||
static final CharPredicate ALPHABETIC() {
|
||||
return Character::isAlphabetic;
|
||||
}
|
||||
|
||||
// \p{gc=Decimal_Number}
|
||||
static final CharPredicate DIGIT = Character::isDigit;
|
||||
static final CharPredicate DIGIT() {
|
||||
return Character::isDigit;
|
||||
}
|
||||
|
||||
static final CharPredicate LETTER = Character::isLetter;
|
||||
static final CharPredicate LETTER() {
|
||||
return Character::isLetter;
|
||||
}
|
||||
|
||||
static final CharPredicate IDEOGRAPHIC = Character::isIdeographic;
|
||||
static final CharPredicate IDEOGRAPHIC() {
|
||||
return Character::isIdeographic;
|
||||
}
|
||||
|
||||
static final CharPredicate LOWERCASE = Character::isLowerCase;
|
||||
static final CharPredicate LOWERCASE() {
|
||||
return Character::isLowerCase;
|
||||
}
|
||||
|
||||
static final CharPredicate UPPERCASE = Character::isUpperCase;
|
||||
static final CharPredicate UPPERCASE() {
|
||||
return Character::isUpperCase;
|
||||
}
|
||||
|
||||
static final CharPredicate TITLECASE = Character::isTitleCase;
|
||||
static final CharPredicate TITLECASE() {
|
||||
return Character::isTitleCase;
|
||||
}
|
||||
|
||||
// \p{Whitespace}
|
||||
static final CharPredicate WHITE_SPACE = ch ->
|
||||
((((1 << Character.SPACE_SEPARATOR) |
|
||||
(1 << Character.LINE_SEPARATOR) |
|
||||
(1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
|
||||
!= 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
|
||||
static final CharPredicate WHITE_SPACE() {
|
||||
return ch ->
|
||||
((((1 << Character.SPACE_SEPARATOR) |
|
||||
(1 << Character.LINE_SEPARATOR) |
|
||||
(1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
|
||||
!= 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
|
||||
}
|
||||
|
||||
// \p{gc=Control}
|
||||
static final CharPredicate CONTROL = ch ->
|
||||
Character.getType(ch) == Character.CONTROL;
|
||||
static final CharPredicate CONTROL() {
|
||||
return ch -> Character.getType(ch) == Character.CONTROL;
|
||||
}
|
||||
|
||||
// \p{gc=Punctuation}
|
||||
static final CharPredicate PUNCTUATION = ch ->
|
||||
((((1 << Character.CONNECTOR_PUNCTUATION) |
|
||||
(1 << Character.DASH_PUNCTUATION) |
|
||||
(1 << Character.START_PUNCTUATION) |
|
||||
(1 << Character.END_PUNCTUATION) |
|
||||
(1 << Character.OTHER_PUNCTUATION) |
|
||||
(1 << Character.INITIAL_QUOTE_PUNCTUATION) |
|
||||
(1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
|
||||
!= 0;
|
||||
static final CharPredicate PUNCTUATION() {
|
||||
return ch ->
|
||||
((((1 << Character.CONNECTOR_PUNCTUATION) |
|
||||
(1 << Character.DASH_PUNCTUATION) |
|
||||
(1 << Character.START_PUNCTUATION) |
|
||||
(1 << Character.END_PUNCTUATION) |
|
||||
(1 << Character.OTHER_PUNCTUATION) |
|
||||
(1 << Character.INITIAL_QUOTE_PUNCTUATION) |
|
||||
(1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
|
||||
!= 0;
|
||||
}
|
||||
|
||||
// \p{gc=Decimal_Number}
|
||||
// \p{Hex_Digit} -> PropList.txt: Hex_Digit
|
||||
static final CharPredicate HEX_DIGIT = DIGIT.union(
|
||||
ch -> (ch >= 0x0030 && ch <= 0x0039) ||
|
||||
(ch >= 0x0041 && ch <= 0x0046) ||
|
||||
(ch >= 0x0061 && ch <= 0x0066) ||
|
||||
(ch >= 0xFF10 && ch <= 0xFF19) ||
|
||||
(ch >= 0xFF21 && ch <= 0xFF26) ||
|
||||
(ch >= 0xFF41 && ch <= 0xFF46));
|
||||
static final CharPredicate HEX_DIGIT() {
|
||||
return DIGIT().union(ch -> (ch >= 0x0030 && ch <= 0x0039) ||
|
||||
(ch >= 0x0041 && ch <= 0x0046) ||
|
||||
(ch >= 0x0061 && ch <= 0x0066) ||
|
||||
(ch >= 0xFF10 && ch <= 0xFF19) ||
|
||||
(ch >= 0xFF21 && ch <= 0xFF26) ||
|
||||
(ch >= 0xFF41 && ch <= 0xFF46));
|
||||
}
|
||||
|
||||
static final CharPredicate ASSIGNED = ch ->
|
||||
Character.getType(ch) != Character.UNASSIGNED;
|
||||
static final CharPredicate ASSIGNED() {
|
||||
return ch -> Character.getType(ch) != Character.UNASSIGNED;
|
||||
}
|
||||
|
||||
// PropList.txt:Noncharacter_Code_Point
|
||||
static final CharPredicate NONCHARACTER_CODE_POINT = ch ->
|
||||
(ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
|
||||
static final CharPredicate NONCHARACTER_CODE_POINT() {
|
||||
return ch -> (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
|
||||
}
|
||||
|
||||
// \p{alpha}
|
||||
// \p{digit}
|
||||
static final CharPredicate ALNUM = ALPHABETIC.union(DIGIT);
|
||||
static final CharPredicate ALNUM() {
|
||||
return ALPHABETIC().union(DIGIT());
|
||||
}
|
||||
|
||||
// \p{Whitespace} --
|
||||
// [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85
|
||||
// \p{gc=Line_Separator}
|
||||
// \p{gc=Paragraph_Separator}]
|
||||
static final CharPredicate BLANK = ch ->
|
||||
Character.getType(ch) == Character.SPACE_SEPARATOR ||
|
||||
ch == 0x9; // \N{HT}
|
||||
static final CharPredicate BLANK() {
|
||||
return ch ->
|
||||
Character.getType(ch) == Character.SPACE_SEPARATOR ||
|
||||
ch == 0x9; // \N{HT}
|
||||
}
|
||||
|
||||
// [^
|
||||
// \p{space}
|
||||
// \p{gc=Control}
|
||||
// \p{gc=Surrogate}
|
||||
// \p{gc=Unassigned}]
|
||||
static final CharPredicate GRAPH = ch ->
|
||||
((((1 << Character.SPACE_SEPARATOR) |
|
||||
(1 << Character.LINE_SEPARATOR) |
|
||||
(1 << Character.PARAGRAPH_SEPARATOR) |
|
||||
(1 << Character.CONTROL) |
|
||||
(1 << Character.SURROGATE) |
|
||||
(1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
|
||||
== 0;
|
||||
static final CharPredicate GRAPH() {
|
||||
return ch ->
|
||||
((((1 << Character.SPACE_SEPARATOR) |
|
||||
(1 << Character.LINE_SEPARATOR) |
|
||||
(1 << Character.PARAGRAPH_SEPARATOR) |
|
||||
(1 << Character.CONTROL) |
|
||||
(1 << Character.SURROGATE) |
|
||||
(1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
|
||||
== 0;
|
||||
}
|
||||
|
||||
// \p{graph}
|
||||
// \p{blank}
|
||||
// -- \p{cntrl}
|
||||
static final CharPredicate PRINT = GRAPH.union(BLANK).and(CONTROL.negate());
|
||||
static final CharPredicate PRINT() {
|
||||
return GRAPH().union(BLANK()).and(CONTROL().negate());
|
||||
}
|
||||
|
||||
// 200C..200D PropList.txt:Join_Control
|
||||
static final CharPredicate JOIN_CONTROL = ch -> ch == 0x200C || ch == 0x200D;
|
||||
static final CharPredicate JOIN_CONTROL() {
|
||||
return ch -> ch == 0x200C || ch == 0x200D;
|
||||
}
|
||||
|
||||
// \p{alpha}
|
||||
// \p{gc=Mark}
|
||||
// \p{digit}
|
||||
// \p{gc=Connector_Punctuation}
|
||||
// \p{Join_Control} 200C..200D
|
||||
static final CharPredicate WORD =
|
||||
ALPHABETIC.union(ch -> ((((1 << Character.NON_SPACING_MARK) |
|
||||
static final CharPredicate WORD() {
|
||||
return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) |
|
||||
(1 << Character.ENCLOSING_MARK) |
|
||||
(1 << Character.COMBINING_SPACING_MARK) |
|
||||
(1 << Character.DECIMAL_DIGIT_NUMBER) |
|
||||
(1 << Character.CONNECTOR_PUNCTUATION))
|
||||
>> Character.getType(ch)) & 1) != 0,
|
||||
JOIN_CONTROL);
|
||||
JOIN_CONTROL());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private static final HashMap<String, CharPredicate> posix = new HashMap<>(12);
|
||||
private static final HashMap<String, CharPredicate> uprops = new HashMap<>(18);
|
||||
|
||||
private static void defPosix(String name, CharPredicate p) {
|
||||
posix.put(name, p);
|
||||
}
|
||||
private static void defUProp(String name, CharPredicate p) {
|
||||
uprops.put(name, p);
|
||||
private static CharPredicate getPosixPredicate(String name) {
|
||||
switch (name) {
|
||||
case "ALPHA": return ALPHABETIC();
|
||||
case "LOWER": return LOWERCASE();
|
||||
case "UPPER": return UPPERCASE();
|
||||
case "SPACE": return WHITE_SPACE();
|
||||
case "PUNCT": return PUNCTUATION();
|
||||
case "XDIGIT": return HEX_DIGIT();
|
||||
case "ALNUM": return ALNUM();
|
||||
case "CNTRL": return CONTROL();
|
||||
case "DIGIT": return DIGIT();
|
||||
case "BLANK": return BLANK();
|
||||
case "GRAPH": return GRAPH();
|
||||
case "PRINT": return PRINT();
|
||||
default: return null;
|
||||
}
|
||||
}
|
||||
|
||||
static {
|
||||
defPosix("ALPHA", ALPHABETIC);
|
||||
defPosix("LOWER", LOWERCASE);
|
||||
defPosix("UPPER", UPPERCASE);
|
||||
defPosix("SPACE", WHITE_SPACE);
|
||||
defPosix("PUNCT", PUNCTUATION);
|
||||
defPosix("XDIGIT",HEX_DIGIT);
|
||||
defPosix("ALNUM", ALNUM);
|
||||
defPosix("CNTRL", CONTROL);
|
||||
defPosix("DIGIT", DIGIT);
|
||||
defPosix("BLANK", BLANK);
|
||||
defPosix("GRAPH", GRAPH);
|
||||
defPosix("PRINT", PRINT);
|
||||
|
||||
defUProp("ALPHABETIC", ALPHABETIC);
|
||||
defUProp("ASSIGNED", ASSIGNED);
|
||||
defUProp("CONTROL", CONTROL);
|
||||
defUProp("HEXDIGIT", HEX_DIGIT);
|
||||
defUProp("IDEOGRAPHIC", IDEOGRAPHIC);
|
||||
defUProp("JOINCONTROL", JOIN_CONTROL);
|
||||
defUProp("LETTER", LETTER);
|
||||
defUProp("LOWERCASE", LOWERCASE);
|
||||
defUProp("NONCHARACTERCODEPOINT", NONCHARACTER_CODE_POINT);
|
||||
defUProp("TITLECASE", TITLECASE);
|
||||
defUProp("PUNCTUATION", PUNCTUATION);
|
||||
defUProp("UPPERCASE", UPPERCASE);
|
||||
defUProp("WHITESPACE", WHITE_SPACE);
|
||||
defUProp("WORD", WORD);
|
||||
defUProp("WHITE_SPACE", WHITE_SPACE);
|
||||
defUProp("HEX_DIGIT", HEX_DIGIT);
|
||||
defUProp("NONCHARACTER_CODE_POINT", NONCHARACTER_CODE_POINT);
|
||||
defUProp("JOIN_CONTROL", JOIN_CONTROL);
|
||||
private static CharPredicate getUnicodePredicate(String name) {
|
||||
switch (name) {
|
||||
case "ALPHABETIC": return ALPHABETIC();
|
||||
case "ASSIGNED": return ASSIGNED();
|
||||
case "CONTROL": return CONTROL();
|
||||
case "HEXDIGIT": return HEX_DIGIT();
|
||||
case "IDEOGRAPHIC": return IDEOGRAPHIC();
|
||||
case "JOINCONTROL": return JOIN_CONTROL();
|
||||
case "LETTER": return LETTER();
|
||||
case "LOWERCASE": return LOWERCASE();
|
||||
case "NONCHARACTERCODEPOINT": return NONCHARACTER_CODE_POINT();
|
||||
case "TITLECASE": return TITLECASE();
|
||||
case "PUNCTUATION": return PUNCTUATION();
|
||||
case "UPPERCASE": return UPPERCASE();
|
||||
case "WHITESPACE": return WHITE_SPACE();
|
||||
case "WORD": return WORD();
|
||||
case "WHITE_SPACE": return WHITE_SPACE();
|
||||
case "HEX_DIGIT": return HEX_DIGIT();
|
||||
case "NONCHARACTER_CODE_POINT": return NONCHARACTER_CODE_POINT();
|
||||
case "JOIN_CONTROL": return JOIN_CONTROL();
|
||||
default: return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static CharPredicate forUnicodeProperty(String propName) {
|
||||
propName = propName.toUpperCase(Locale.ROOT);
|
||||
CharPredicate p = uprops.get(propName);
|
||||
CharPredicate p = getUnicodePredicate(propName);
|
||||
if (p != null)
|
||||
return p;
|
||||
return posix.get(propName);
|
||||
return getPosixPredicate(propName);
|
||||
}
|
||||
|
||||
public static CharPredicate forPOSIXName(String propName) {
|
||||
return posix.get(propName.toUpperCase(Locale.ENGLISH));
|
||||
return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
@ -223,145 +254,130 @@ class CharPredicates {
|
||||
|
||||
// unicode categories, aliases, properties, java methods ...
|
||||
|
||||
private static final HashMap<String, CharPredicate> props = new HashMap<>(128);
|
||||
|
||||
/**
|
||||
* Returns a predicate matching all characters in a named property.
|
||||
*/
|
||||
static CharPredicate forProperty(String name) {
|
||||
return props.get(name);
|
||||
}
|
||||
|
||||
private static void defProp(String name, CharPredicate p) {
|
||||
props.put(name, p);
|
||||
}
|
||||
|
||||
private static void defCategory(String name, final int typeMask) {
|
||||
CharPredicate p = ch -> (typeMask & (1 << Character.getType(ch))) != 0;
|
||||
props.put(name, p);
|
||||
}
|
||||
|
||||
private static void defRange(String name, final int lower, final int upper) {
|
||||
BmpCharPredicate p = ch -> lower <= ch && ch <= upper;
|
||||
props.put(name, p);
|
||||
}
|
||||
|
||||
private static void defCtype(String name, final int ctype) {
|
||||
BmpCharPredicate p = ch -> ch < 128 && ASCII.isType(ch, ctype);
|
||||
// PrintPattern.pmap.put(p, name);
|
||||
props.put(name, p);
|
||||
}
|
||||
|
||||
static {
|
||||
// Unicode character property aliases, defined in
|
||||
// http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
|
||||
defCategory("Cn", 1<<Character.UNASSIGNED);
|
||||
defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
|
||||
defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
|
||||
defCategory("Lt", 1<<Character.TITLECASE_LETTER);
|
||||
defCategory("Lm", 1<<Character.MODIFIER_LETTER);
|
||||
defCategory("Lo", 1<<Character.OTHER_LETTER);
|
||||
defCategory("Mn", 1<<Character.NON_SPACING_MARK);
|
||||
defCategory("Me", 1<<Character.ENCLOSING_MARK);
|
||||
defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
|
||||
defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
|
||||
defCategory("Nl", 1<<Character.LETTER_NUMBER);
|
||||
defCategory("No", 1<<Character.OTHER_NUMBER);
|
||||
defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
|
||||
defCategory("Zl", 1<<Character.LINE_SEPARATOR);
|
||||
defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
|
||||
defCategory("Cc", 1<<Character.CONTROL);
|
||||
defCategory("Cf", 1<<Character.FORMAT);
|
||||
defCategory("Co", 1<<Character.PRIVATE_USE);
|
||||
defCategory("Cs", 1<<Character.SURROGATE);
|
||||
defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
|
||||
defCategory("Ps", 1<<Character.START_PUNCTUATION);
|
||||
defCategory("Pe", 1<<Character.END_PUNCTUATION);
|
||||
defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
|
||||
defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
|
||||
defCategory("Sm", 1<<Character.MATH_SYMBOL);
|
||||
defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
|
||||
defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
|
||||
defCategory("So", 1<<Character.OTHER_SYMBOL);
|
||||
defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
|
||||
defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
|
||||
defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
|
||||
(1<<Character.LOWERCASE_LETTER) |
|
||||
(1<<Character.TITLECASE_LETTER) |
|
||||
(1<<Character.MODIFIER_LETTER) |
|
||||
(1<<Character.OTHER_LETTER)));
|
||||
defCategory("M", ((1<<Character.NON_SPACING_MARK) |
|
||||
(1<<Character.ENCLOSING_MARK) |
|
||||
(1<<Character.COMBINING_SPACING_MARK)));
|
||||
defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
|
||||
(1<<Character.LETTER_NUMBER) |
|
||||
(1<<Character.OTHER_NUMBER)));
|
||||
defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
|
||||
(1<<Character.LINE_SEPARATOR) |
|
||||
(1<<Character.PARAGRAPH_SEPARATOR)));
|
||||
defCategory("C", ((1<<Character.CONTROL) |
|
||||
(1<<Character.FORMAT) |
|
||||
(1<<Character.PRIVATE_USE) |
|
||||
(1<<Character.SURROGATE) |
|
||||
(1<<Character.UNASSIGNED))); // Other
|
||||
defCategory("P", ((1<<Character.DASH_PUNCTUATION) |
|
||||
(1<<Character.START_PUNCTUATION) |
|
||||
(1<<Character.END_PUNCTUATION) |
|
||||
(1<<Character.CONNECTOR_PUNCTUATION) |
|
||||
(1<<Character.OTHER_PUNCTUATION) |
|
||||
(1<<Character.INITIAL_QUOTE_PUNCTUATION) |
|
||||
(1<<Character.FINAL_QUOTE_PUNCTUATION)));
|
||||
defCategory("S", ((1<<Character.MATH_SYMBOL) |
|
||||
(1<<Character.CURRENCY_SYMBOL) |
|
||||
(1<<Character.MODIFIER_SYMBOL) |
|
||||
(1<<Character.OTHER_SYMBOL)));
|
||||
defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
|
||||
(1<<Character.LOWERCASE_LETTER) |
|
||||
(1<<Character.TITLECASE_LETTER)));
|
||||
defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
|
||||
(1<<Character.LOWERCASE_LETTER) |
|
||||
(1<<Character.TITLECASE_LETTER) |
|
||||
(1<<Character.MODIFIER_LETTER) |
|
||||
(1<<Character.OTHER_LETTER) |
|
||||
(1<<Character.DECIMAL_DIGIT_NUMBER)));
|
||||
defRange("L1", 0x00, 0xFF); // Latin-1
|
||||
props.put("all", ch -> true);
|
||||
switch (name) {
|
||||
case "Cn": return category(1<<Character.UNASSIGNED);
|
||||
case "Lu": return category(1<<Character.UPPERCASE_LETTER);
|
||||
case "Ll": return category(1<<Character.LOWERCASE_LETTER);
|
||||
case "Lt": return category(1<<Character.TITLECASE_LETTER);
|
||||
case "Lm": return category(1<<Character.MODIFIER_LETTER);
|
||||
case "Lo": return category(1<<Character.OTHER_LETTER);
|
||||
case "Mn": return category(1<<Character.NON_SPACING_MARK);
|
||||
case "Me": return category(1<<Character.ENCLOSING_MARK);
|
||||
case "Mc": return category(1<<Character.COMBINING_SPACING_MARK);
|
||||
case "Nd": return category(1<<Character.DECIMAL_DIGIT_NUMBER);
|
||||
case "Nl": return category(1<<Character.LETTER_NUMBER);
|
||||
case "No": return category(1<<Character.OTHER_NUMBER);
|
||||
case "Zs": return category(1<<Character.SPACE_SEPARATOR);
|
||||
case "Zl": return category(1<<Character.LINE_SEPARATOR);
|
||||
case "Zp": return category(1<<Character.PARAGRAPH_SEPARATOR);
|
||||
case "Cc": return category(1<<Character.CONTROL);
|
||||
case "Cf": return category(1<<Character.FORMAT);
|
||||
case "Co": return category(1<<Character.PRIVATE_USE);
|
||||
case "Cs": return category(1<<Character.SURROGATE);
|
||||
case "Pd": return category(1<<Character.DASH_PUNCTUATION);
|
||||
case "Ps": return category(1<<Character.START_PUNCTUATION);
|
||||
case "Pe": return category(1<<Character.END_PUNCTUATION);
|
||||
case "Pc": return category(1<<Character.CONNECTOR_PUNCTUATION);
|
||||
case "Po": return category(1<<Character.OTHER_PUNCTUATION);
|
||||
case "Sm": return category(1<<Character.MATH_SYMBOL);
|
||||
case "Sc": return category(1<<Character.CURRENCY_SYMBOL);
|
||||
case "Sk": return category(1<<Character.MODIFIER_SYMBOL);
|
||||
case "So": return category(1<<Character.OTHER_SYMBOL);
|
||||
case "Pi": return category(1<<Character.INITIAL_QUOTE_PUNCTUATION);
|
||||
case "Pf": return category(1<<Character.FINAL_QUOTE_PUNCTUATION);
|
||||
case "L": return category(((1<<Character.UPPERCASE_LETTER) |
|
||||
(1<<Character.LOWERCASE_LETTER) |
|
||||
(1<<Character.TITLECASE_LETTER) |
|
||||
(1<<Character.MODIFIER_LETTER) |
|
||||
(1<<Character.OTHER_LETTER)));
|
||||
case "M": return category(((1<<Character.NON_SPACING_MARK) |
|
||||
(1<<Character.ENCLOSING_MARK) |
|
||||
(1<<Character.COMBINING_SPACING_MARK)));
|
||||
case "N": return category(((1<<Character.DECIMAL_DIGIT_NUMBER) |
|
||||
(1<<Character.LETTER_NUMBER) |
|
||||
(1<<Character.OTHER_NUMBER)));
|
||||
case "Z": return category(((1<<Character.SPACE_SEPARATOR) |
|
||||
(1<<Character.LINE_SEPARATOR) |
|
||||
(1<<Character.PARAGRAPH_SEPARATOR)));
|
||||
case "C": return category(((1<<Character.CONTROL) |
|
||||
(1<<Character.FORMAT) |
|
||||
(1<<Character.PRIVATE_USE) |
|
||||
(1<<Character.SURROGATE) |
|
||||
(1<<Character.UNASSIGNED))); // Other
|
||||
case "P": return category(((1<<Character.DASH_PUNCTUATION) |
|
||||
(1<<Character.START_PUNCTUATION) |
|
||||
(1<<Character.END_PUNCTUATION) |
|
||||
(1<<Character.CONNECTOR_PUNCTUATION) |
|
||||
(1<<Character.OTHER_PUNCTUATION) |
|
||||
(1<<Character.INITIAL_QUOTE_PUNCTUATION) |
|
||||
(1<<Character.FINAL_QUOTE_PUNCTUATION)));
|
||||
case "S": return category(((1<<Character.MATH_SYMBOL) |
|
||||
(1<<Character.CURRENCY_SYMBOL) |
|
||||
(1<<Character.MODIFIER_SYMBOL) |
|
||||
(1<<Character.OTHER_SYMBOL)));
|
||||
case "LC": return category(((1<<Character.UPPERCASE_LETTER) |
|
||||
(1<<Character.LOWERCASE_LETTER) |
|
||||
(1<<Character.TITLECASE_LETTER)));
|
||||
case "LD": return category(((1<<Character.UPPERCASE_LETTER) |
|
||||
(1<<Character.LOWERCASE_LETTER) |
|
||||
(1<<Character.TITLECASE_LETTER) |
|
||||
(1<<Character.MODIFIER_LETTER) |
|
||||
(1<<Character.OTHER_LETTER) |
|
||||
(1<<Character.DECIMAL_DIGIT_NUMBER)));
|
||||
case "L1": return range(0x00, 0xFF); // Latin-1
|
||||
case "all": return Pattern.ALL();
|
||||
// Posix regular expression character classes, defined in
|
||||
// http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
|
||||
case "ASCII": return range(0x00, 0x7F); // ASCII
|
||||
case "Alnum": return ctype(ASCII.ALNUM); // Alphanumeric characters
|
||||
case "Alpha": return ctype(ASCII.ALPHA); // Alphabetic characters
|
||||
case "Blank": return ctype(ASCII.BLANK); // Space and tab characters
|
||||
case "Cntrl": return ctype(ASCII.CNTRL); // Control characters
|
||||
case "Digit": return range('0', '9'); // Numeric characters
|
||||
case "Graph": return ctype(ASCII.GRAPH); // printable and visible
|
||||
case "Lower": return range('a', 'z'); // Lower-case alphabetic
|
||||
case "Print": return range(0x20, 0x7E); // Printable characters
|
||||
case "Punct": return ctype(ASCII.PUNCT); // Punctuation characters
|
||||
case "Space": return ctype(ASCII.SPACE); // Space characters
|
||||
case "Upper": return range('A', 'Z'); // Upper-case alphabetic
|
||||
case "XDigit": return ctype(ASCII.XDIGIT); // hexadecimal digits
|
||||
|
||||
// Posix regular expression character classes, defined in
|
||||
// http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
|
||||
defRange("ASCII", 0x00, 0x7F); // ASCII
|
||||
defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
|
||||
defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
|
||||
defCtype("Blank", ASCII.BLANK); // Space and tab characters
|
||||
defCtype("Cntrl", ASCII.CNTRL); // Control characters
|
||||
defRange("Digit", '0', '9'); // Numeric characters
|
||||
defCtype("Graph", ASCII.GRAPH); // printable and visible
|
||||
defRange("Lower", 'a', 'z'); // Lower-case alphabetic
|
||||
defRange("Print", 0x20, 0x7E); // Printable characters
|
||||
defCtype("Punct", ASCII.PUNCT); // Punctuation characters
|
||||
defCtype("Space", ASCII.SPACE); // Space characters
|
||||
defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
|
||||
defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
|
||||
// Java character properties, defined by methods in Character.java
|
||||
case "javaLowerCase": return java.lang.Character::isLowerCase;
|
||||
case "javaUpperCase": return Character::isUpperCase;
|
||||
case "javaAlphabetic": return java.lang.Character::isAlphabetic;
|
||||
case "javaIdeographic": return java.lang.Character::isIdeographic;
|
||||
case "javaTitleCase": return java.lang.Character::isTitleCase;
|
||||
case "javaDigit": return java.lang.Character::isDigit;
|
||||
case "javaDefined": return java.lang.Character::isDefined;
|
||||
case "javaLetter": return java.lang.Character::isLetter;
|
||||
case "javaLetterOrDigit": return java.lang.Character::isLetterOrDigit;
|
||||
case "javaJavaIdentifierStart": return java.lang.Character::isJavaIdentifierStart;
|
||||
case "javaJavaIdentifierPart": return java.lang.Character::isJavaIdentifierPart;
|
||||
case "javaUnicodeIdentifierStart": return java.lang.Character::isUnicodeIdentifierStart;
|
||||
case "javaUnicodeIdentifierPart": return java.lang.Character::isUnicodeIdentifierPart;
|
||||
case "javaIdentifierIgnorable": return java.lang.Character::isIdentifierIgnorable;
|
||||
case "javaSpaceChar": return java.lang.Character::isSpaceChar;
|
||||
case "javaWhitespace": return java.lang.Character::isWhitespace;
|
||||
case "javaISOControl": return java.lang.Character::isISOControl;
|
||||
case "javaMirrored": return java.lang.Character::isMirrored;
|
||||
default: return null;
|
||||
}
|
||||
}
|
||||
|
||||
// Java character properties, defined by methods in Character.java
|
||||
defProp("javaLowerCase", java.lang.Character::isLowerCase);
|
||||
defProp("javaUpperCase", Character::isUpperCase);
|
||||
defProp("javaAlphabetic", java.lang.Character::isAlphabetic);
|
||||
defProp("javaIdeographic", java.lang.Character::isIdeographic);
|
||||
defProp("javaTitleCase", java.lang.Character::isTitleCase);
|
||||
defProp("javaDigit", java.lang.Character::isDigit);
|
||||
defProp("javaDefined", java.lang.Character::isDefined);
|
||||
defProp("javaLetter", java.lang.Character::isLetter);
|
||||
defProp("javaLetterOrDigit", java.lang.Character::isLetterOrDigit);
|
||||
defProp("javaJavaIdentifierStart", java.lang.Character::isJavaIdentifierStart);
|
||||
defProp("javaJavaIdentifierPart", java.lang.Character::isJavaIdentifierPart);
|
||||
defProp("javaUnicodeIdentifierStart", java.lang.Character::isUnicodeIdentifierStart);
|
||||
defProp("javaUnicodeIdentifierPart", java.lang.Character::isUnicodeIdentifierPart);
|
||||
defProp("javaIdentifierIgnorable", java.lang.Character::isIdentifierIgnorable);
|
||||
defProp("javaSpaceChar", java.lang.Character::isSpaceChar);
|
||||
defProp("javaWhitespace", java.lang.Character::isWhitespace);
|
||||
defProp("javaISOControl", java.lang.Character::isISOControl);
|
||||
defProp("javaMirrored", java.lang.Character::isMirrored);
|
||||
private static CharPredicate category(final int typeMask) {
|
||||
return ch -> (typeMask & (1 << Character.getType(ch))) != 0;
|
||||
}
|
||||
|
||||
private static CharPredicate range(final int lower, final int upper) {
|
||||
return (BmpCharPredicate)ch -> lower <= ch && ch <= upper;
|
||||
}
|
||||
|
||||
private static CharPredicate ctype(final int ctype) {
|
||||
return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
@ -369,8 +385,14 @@ class CharPredicates {
|
||||
/**
|
||||
* Posix ASCII variants, not in the lookup map
|
||||
*/
|
||||
static final BmpCharPredicate ASCII_DIGIT = ch -> ch < 128 && ASCII.isDigit(ch);
|
||||
static final BmpCharPredicate ASCII_WORD = ch -> ch < 128 && ASCII.isWord(ch);
|
||||
static final BmpCharPredicate ASCII_SPACE = ch -> ch < 128 && ASCII.isSpace(ch);
|
||||
static final BmpCharPredicate ASCII_DIGIT() {
|
||||
return ch -> ch < 128 && ASCII.isDigit(ch);
|
||||
}
|
||||
static final BmpCharPredicate ASCII_WORD() {
|
||||
return ch -> ch < 128 && ASCII.isWord(ch);
|
||||
}
|
||||
static final BmpCharPredicate ASCII_SPACE() {
|
||||
return ch -> ch < 128 && ASCII.isSpace(ch);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1495,7 +1495,7 @@ public final class Pattern
|
||||
altns.add(seq);
|
||||
produceEquivalentAlternation(nfd, altns);
|
||||
dst.append("(?:");
|
||||
altns.forEach( s -> dst.append(s + "|"));
|
||||
altns.forEach( s -> dst.append(s).append('|'));
|
||||
dst.delete(dst.length() - 1, dst.length());
|
||||
dst.append(")");
|
||||
continue;
|
||||
@ -2142,12 +2142,12 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case '.':
|
||||
next();
|
||||
if (has(DOTALL)) {
|
||||
node = new CharProperty(ALL);
|
||||
node = new CharProperty(ALL());
|
||||
} else {
|
||||
if (has(UNIX_LINES)) {
|
||||
node = new CharProperty(UNIXDOT);
|
||||
node = new CharProperty(UNIXDOT());
|
||||
} else {
|
||||
node = new CharProperty(DOT);
|
||||
node = new CharProperty(DOT());
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -2376,7 +2376,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 'D':
|
||||
if (create) {
|
||||
predicate = has(UNICODE_CHARACTER_CLASS) ?
|
||||
CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
|
||||
CharPredicates.DIGIT() : CharPredicates.ASCII_DIGIT();
|
||||
predicate = predicate.negate();
|
||||
if (!inclass)
|
||||
root = newCharProperty(predicate);
|
||||
@ -2391,7 +2391,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
return -1;
|
||||
case 'H':
|
||||
if (create) {
|
||||
predicate = HorizWS.negate();
|
||||
predicate = HorizWS().negate();
|
||||
if (!inclass)
|
||||
root = newCharProperty(predicate);
|
||||
}
|
||||
@ -2415,7 +2415,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 'S':
|
||||
if (create) {
|
||||
predicate = has(UNICODE_CHARACTER_CLASS) ?
|
||||
CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
|
||||
CharPredicates.WHITE_SPACE() : CharPredicates.ASCII_SPACE();
|
||||
predicate = predicate.negate();
|
||||
if (!inclass)
|
||||
root = newCharProperty(predicate);
|
||||
@ -2426,7 +2426,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
break;
|
||||
case 'V':
|
||||
if (create) {
|
||||
predicate = VertWS.negate();
|
||||
predicate = VertWS().negate();
|
||||
if (!inclass)
|
||||
root = newCharProperty(predicate);
|
||||
}
|
||||
@ -2434,7 +2434,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 'W':
|
||||
if (create) {
|
||||
predicate = has(UNICODE_CHARACTER_CLASS) ?
|
||||
CharPredicates.WORD : CharPredicates.ASCII_WORD;
|
||||
CharPredicates.WORD() : CharPredicates.ASCII_WORD();
|
||||
predicate = predicate.negate();
|
||||
if (!inclass)
|
||||
root = newCharProperty(predicate);
|
||||
@ -2480,7 +2480,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 'd':
|
||||
if (create) {
|
||||
predicate = has(UNICODE_CHARACTER_CLASS) ?
|
||||
CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
|
||||
CharPredicates.DIGIT() : CharPredicates.ASCII_DIGIT();
|
||||
if (!inclass)
|
||||
root = newCharProperty(predicate);
|
||||
}
|
||||
@ -2493,7 +2493,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
break;
|
||||
case 'h':
|
||||
if (create) {
|
||||
predicate = HorizWS;
|
||||
predicate = HorizWS();
|
||||
if (!inclass)
|
||||
root = newCharProperty(predicate);
|
||||
}
|
||||
@ -2531,7 +2531,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 's':
|
||||
if (create) {
|
||||
predicate = has(UNICODE_CHARACTER_CLASS) ?
|
||||
CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
|
||||
CharPredicates.WHITE_SPACE() : CharPredicates.ASCII_SPACE();
|
||||
if (!inclass)
|
||||
root = newCharProperty(predicate);
|
||||
}
|
||||
@ -2552,7 +2552,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
if (isrange)
|
||||
return '\013';
|
||||
if (create) {
|
||||
predicate = VertWS;
|
||||
predicate = VertWS();
|
||||
if (!inclass)
|
||||
root = newCharProperty(predicate);
|
||||
}
|
||||
@ -2560,7 +2560,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
case 'w':
|
||||
if (create) {
|
||||
predicate = has(UNICODE_CHARACTER_CLASS) ?
|
||||
CharPredicates.WORD : CharPredicates.ASCII_WORD;
|
||||
CharPredicates.WORD() : CharPredicates.ASCII_WORD();
|
||||
if (!inclass)
|
||||
root = newCharProperty(predicate);
|
||||
}
|
||||
@ -2704,7 +2704,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
(6)AngstromSign u+212b
|
||||
toLowerCase(u+212b) ==> u+00e5
|
||||
*/
|
||||
int d;
|
||||
if (ch < 256 &&
|
||||
!(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
|
||||
(ch == 0xff || ch == 0xb5 ||
|
||||
@ -5384,7 +5383,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
}
|
||||
|
||||
boolean isWord(int ch) {
|
||||
return useUWORD ? CharPredicates.WORD.is(ch)
|
||||
return useUWORD ? CharPredicates.WORD().is(ch)
|
||||
: (ch == '_' || Character.isLetterOrDigit(ch));
|
||||
}
|
||||
|
||||
@ -5680,33 +5679,45 @@ NEXT: while (i <= last) {
|
||||
/**
|
||||
* matches a Perl vertical whitespace
|
||||
*/
|
||||
static BmpCharPredicate VertWS = cp ->
|
||||
(cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029;
|
||||
static BmpCharPredicate VertWS() {
|
||||
return cp -> (cp >= 0x0A && cp <= 0x0D) ||
|
||||
cp == 0x85 || cp == 0x2028 || cp == 0x2029;
|
||||
}
|
||||
|
||||
/**
|
||||
* matches a Perl horizontal whitespace
|
||||
*/
|
||||
static BmpCharPredicate HorizWS = cp ->
|
||||
cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||
|
||||
cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f ||
|
||||
cp == 0x205f || cp == 0x3000;
|
||||
static BmpCharPredicate HorizWS() {
|
||||
return cp ->
|
||||
cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||
|
||||
cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f ||
|
||||
cp == 0x205f || cp == 0x3000;
|
||||
}
|
||||
|
||||
/**
|
||||
* for the Unicode category ALL and the dot metacharacter when
|
||||
* in dotall mode.
|
||||
*/
|
||||
static CharPredicate ALL = ch -> true;
|
||||
static CharPredicate ALL() {
|
||||
return ch -> true;
|
||||
}
|
||||
|
||||
/**
|
||||
* for the dot metacharacter when dotall is not enabled.
|
||||
*/
|
||||
static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r'
|
||||
&& (ch|1) != '\u2029'
|
||||
&& ch != '\u0085');
|
||||
static CharPredicate DOT() {
|
||||
return ch ->
|
||||
(ch != '\n' && ch != '\r'
|
||||
&& (ch|1) != '\u2029'
|
||||
&& ch != '\u0085');
|
||||
}
|
||||
|
||||
/**
|
||||
* the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled.
|
||||
*/
|
||||
static CharPredicate UNIXDOT = ch -> ch != '\n';
|
||||
static CharPredicate UNIXDOT() {
|
||||
return ch -> ch != '\n';
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicate that matches a Supplementary Unicode character
|
||||
|
||||
@ -27,7 +27,6 @@ package java.util.regex;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.regex.Pattern.CharPredicate;
|
||||
import java.util.regex.CharPredicates;
|
||||
import static java.util.regex.ASCII.*;
|
||||
|
||||
/**
|
||||
@ -106,15 +105,15 @@ class PrintPattern {
|
||||
static HashMap<CharPredicate, String> pmap;
|
||||
static {
|
||||
pmap = new HashMap<>();
|
||||
pmap.put(Pattern.ALL, "All");
|
||||
pmap.put(Pattern.DOT, "Dot");
|
||||
pmap.put(Pattern.UNIXDOT, "UnixDot");
|
||||
pmap.put(Pattern.VertWS, "VertWS");
|
||||
pmap.put(Pattern.HorizWS, "HorizWS");
|
||||
pmap.put(Pattern.ALL(), "All");
|
||||
pmap.put(Pattern.DOT(), "Dot");
|
||||
pmap.put(Pattern.UNIXDOT(), "UnixDot");
|
||||
pmap.put(Pattern.VertWS(), "VertWS");
|
||||
pmap.put(Pattern.HorizWS(), "HorizWS");
|
||||
|
||||
pmap.put(CharPredicates.ASCII_DIGIT, "ASCII.DIGIT");
|
||||
pmap.put(CharPredicates.ASCII_WORD, "ASCII.WORD");
|
||||
pmap.put(CharPredicates.ASCII_SPACE, "ASCII.SPACE");
|
||||
pmap.put(CharPredicates.ASCII_DIGIT(), "ASCII.DIGIT");
|
||||
pmap.put(CharPredicates.ASCII_WORD(), "ASCII.WORD");
|
||||
pmap.put(CharPredicates.ASCII_SPACE(), "ASCII.SPACE");
|
||||
}
|
||||
|
||||
static void walk(Pattern.Node node, int depth) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user