mirror of
https://github.com/openjdk/jdk.git
synced 2026-02-16 13:25:34 +00:00
6945564: Unicode script support in Character class
6948903: Make Unicode scripts available for use in regular expressions Added Unicode script support Reviewed-by: martin
This commit is contained in:
parent
d1dc8092ec
commit
bb2be852aa
@ -34,6 +34,7 @@ JAVA_JAVA_java = \
|
||||
java/lang/Thread.java \
|
||||
java/lang/Character.java \
|
||||
java/lang/CharacterData.java \
|
||||
java/lang/CharacterName.java \
|
||||
sun/misc/ASCIICaseInsensitiveComparator.java \
|
||||
sun/misc/VM.java \
|
||||
sun/misc/Signal.java \
|
||||
|
||||
@ -384,6 +384,27 @@ clean::
|
||||
$(RM) $(GENSRCDIR)/java/lang/CharacterDataUndefined.java
|
||||
$(RM) $(GENSRCDIR)/java/lang/CharacterDataPrivateUse.java
|
||||
|
||||
#
# Rules to generate classes/java/lang/uniName.dat
#
# uniName.dat is produced by running the build tool
# build.tools.generatecharacter.CharacterName over UnicodeData.txt.
#

UNINAME = $(CLASSBINDIR)/java/lang/uniName.dat
GENERATEUNINAME_JARFILE = $(BUILDTOOLJARDIR)/generatecharacter.jar

build: $(UNINAME)

# Use the jar variable defined just above so this rule is self-contained.
$(UNINAME): $(UNICODEDATA)/UnicodeData.txt \
		$(GENERATEUNINAME_JARFILE)
	@$(prep-target)
	$(BOOT_JAVA_CMD) -classpath $(GENERATEUNINAME_JARFILE) \
		build.tools.generatecharacter.CharacterName \
		$(UNICODEDATA)/UnicodeData.txt $(UNINAME)

clean::
	$(RM) $(UNINAME)
|
||||
|
||||
#
|
||||
# End of rules to create $(GENSRCDIR)/java/lang/CharacterDataXX.java
|
||||
#
|
||||
|
||||
1972
jdk/make/tools/UnicodeData/Scripts.txt
Normal file
1972
jdk/make/tools/UnicodeData/Scripts.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,100 @@
|
||||
package build.tools.generatecharacter;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.*;
|
||||
import java.util.*;
|
||||
import java.util.zip.*;
|
||||
|
||||
public class CharacterName {
|
||||
|
||||
public static void main(String[] args) {
|
||||
FileReader reader = null;
|
||||
try {
|
||||
if (args.length != 2) {
|
||||
System.err.println("Usage: java CharacterName UniocdeData.txt uniName.dat");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
reader = new FileReader(args[0]);
|
||||
BufferedReader bfr = new BufferedReader(reader);
|
||||
String line = null;
|
||||
|
||||
StringBuilder namePool = new StringBuilder();
|
||||
byte[] cpPoolBytes = new byte[0x100000];
|
||||
ByteBuffer cpBB = ByteBuffer.wrap(cpPoolBytes);
|
||||
int lastCp = 0;
|
||||
int cpNum = 0;
|
||||
|
||||
while ((line = bfr.readLine()) != null) {
|
||||
if (line.startsWith("#"))
|
||||
continue;
|
||||
UnicodeSpec spec = UnicodeSpec.parse(line);
|
||||
if (spec != null) {
|
||||
int cp = spec.getCodePoint();
|
||||
String name = spec.getName();
|
||||
cpNum++;
|
||||
if (name.equals("<control>") && spec.getOldName() != null) {
|
||||
if (spec.getOldName().length() != 0)
|
||||
name = spec.getOldName();
|
||||
else
|
||||
continue;
|
||||
} else if (name.startsWith("<")) {
|
||||
/*
|
||||
3400 <CJK Ideograph Extension A, First>
|
||||
4db5 <CJK Ideograph Extension A, Last>
|
||||
4e00 <CJK Ideograph, First>
|
||||
9fc3 <CJK Ideograph, Last>
|
||||
ac00 <Hangul Syllable, First>
|
||||
d7a3 <Hangul Syllable, Last>
|
||||
d800 <Non Private Use High Surrogate, First>
|
||||
db7f <Non Private Use High Surrogate, Last>
|
||||
db80 <Private Use High Surrogate, First>
|
||||
dbff <Private Use High Surrogate, Last>
|
||||
dc00 <Low Surrogate, First>
|
||||
dfff <Low Surrogate, Last>
|
||||
e000 <Private Use, First>
|
||||
f8ff <Private Use, Last>
|
||||
20000 <CJK Ideograph Extension B, First>
|
||||
2a6d6 <CJK Ideograph Extension B, Last>
|
||||
f0000 <Plane 15 Private Use, First>
|
||||
ffffd <Plane 15 Private Use, Last>
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
if (cp == lastCp + 1) {
|
||||
cpBB.put((byte)name.length());
|
||||
} else {
|
||||
cpBB.put((byte)0); // segment start flag
|
||||
cpBB.putInt((name.length() << 24) | (cp & 0xffffff));
|
||||
}
|
||||
namePool.append(name);
|
||||
lastCp = cp;
|
||||
}
|
||||
}
|
||||
|
||||
byte[] namePoolBytes = namePool.toString().getBytes("ASCII");
|
||||
int cpLen = cpBB.position();
|
||||
int total = cpLen + namePoolBytes.length;
|
||||
|
||||
DataOutputStream dos = new DataOutputStream(
|
||||
new DeflaterOutputStream(
|
||||
new FileOutputStream(args[1])));
|
||||
dos.writeInt(total); // total
|
||||
dos.writeInt(cpLen); // nameOff
|
||||
dos.write(cpPoolBytes, 0, cpLen);
|
||||
dos.write(namePoolBytes);
|
||||
dos.close();
|
||||
|
||||
} catch (Throwable e) {
|
||||
System.out.println("Unexpected exception:");
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
if (reader != null) {
|
||||
try {
|
||||
reader.close();
|
||||
} catch (Throwable ee) { ee.printStackTrace(); }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,214 @@
|
||||
import java.util.regex.*;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
/**
 * Developer tool that reads a Scripts.txt style file and prints, on
 * standard output, a per-code-point listing followed by the Java source
 * of a {@code Scripts.UnicodeScript} enum with its range lookup tables.
 */
public class CharacterScript {

    // generate the code needed for j.l.C.UnicodeScript

    /** Output hook that is currently disabled (prints nothing). */
    static void fortest(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /** Formats to standard output. */
    static void print(String fmt, Object... o) {
        System.out.printf(fmt, o);
    }

    /** Diagnostic output; currently disabled (prints nothing). */
    static void debug(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /**
     * Entry point.
     *
     * @param args exactly one element: the path of the script data file.
     *             Any exception raised while processing is printed with
     *             its stack trace.
     */
    public static void main(String args[]){
        try {
            if (args.length != 1) {
                System.out.println("java CharacterScript script.txt");
                System.exit(1);
            }

            Map<String,Integer> scriptMap = new HashMap<String,Integer>();
            List<int[]> scripts = readRanges(args[0], scriptMap);

            debug("-----------------%n");
            debug("Total scripts=%s%n", scriptMap.size());
            debug("-----------------%n%n");

            // index -> script name, in order of first appearance
            String[] names = new String[scriptMap.size()];
            for (Map.Entry<String,Integer> e : scriptMap.entrySet()) {
                names[e.getValue().intValue()] = e.getKey();
            }

            // per-code-point listing, in file order
            for (int[] range : scripts) {
                String name = names[range[2]].toUpperCase(Locale.ENGLISH);
                for (int cp = range[0]; cp <= range[1]; cp++) {
                    System.out.printf("%05X %s%n", cp, name);
                }
            }

            Collections.sort(scripts, new Comparator<int[]>() {
                public int compare(int[] a1, int[] a2) {
                    // code points are small non-negative ints, so the
                    // subtraction cannot overflow
                    return a1[0] - a2[0];
                }
            });

            List<int[]> list = consolidate(scripts);

            for (int[] a : list) {
                debug("0x%05x, 0x%05x %s%n", a[0], a[1], scriptName(a, names));
            }
            debug("--->total=%d%n", list.size());

            emit(names, list);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the data file and returns its ranges as {start, end, scriptIndex}
     * triples, merging adjacent lines that continue the same script.
     * Script indexes are allocated in {@code scriptMap} in order of first
     * appearance. The list grows as needed, so the number of ranges is not
     * limited.
     *
     * @throws IllegalArgumentException if the file contains no range line
     */
    private static List<int[]> readRanges(String file,
                                          Map<String,Integer> scriptMap)
        throws IOException
    {
        Matcher m = Pattern.compile("(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s+;\\s+(\\w+)\\s+#.*").matcher("");
        List<int[]> ranges = new ArrayList<int[]>();
        int prevS = -1;
        int prevE = -1;
        String prevN = null;

        BufferedReader sbfr = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = sbfr.readLine()) != null) {
                if (line.length() <= 1 || line.charAt(0) == '#') {
                    continue;
                }
                m.reset(line);
                if (!m.matches()) {
                    debug("Warning: Unrecognized line <%s>%n", line);
                    continue;
                }
                int start = Integer.parseInt(m.group(1), 16);
                int end = (m.group(2) == null) ? start
                                               : Integer.parseInt(m.group(2), 16);
                String name = m.group(3);
                if (name.equals(prevN) && start == prevE + 1) {
                    prevE = end;                  // extends the pending run
                } else {
                    if (prevS != -1) {
                        addRange(ranges, scriptMap, prevS, prevE, prevN);
                    }
                    debug("%x-%x\t%s%n", prevS, prevE, prevN);
                    prevS = start; prevE = end; prevN = name;
                }
            }
        } finally {
            sbfr.close();
        }

        if (prevS == -1) {
            throw new IllegalArgumentException(
                "No script ranges found in " + file);
        }
        //last one.
        addRange(ranges, scriptMap, prevS, prevE, prevN);
        debug("%x-%x\t%s%n", prevS, prevE, prevN);
        return ranges;
    }

    /** Appends one range, allocating a new index for an unseen script name. */
    private static void addRange(List<int[]> ranges,
                                 Map<String,Integer> scriptMap,
                                 int start, int end, String name)
    {
        Integer id = scriptMap.get(name);
        if (id == null) {
            id = Integer.valueOf(scriptMap.size());
            scriptMap.put(name, id);
        }
        ranges.add(new int[] { start, end, id.intValue() });
    }

    /**
     * Consolidation: there are lots of "reserved" code points
     * embedded in those otherwise "sequential" blocks.
     * To make the lookup table smaller, we combine those
     * separated segments with the assumption that the lookup
     * implementation checks
     *    Character.getType() !=  Character.UNASSIGNED
     * first (return UNKNOWN for unassigned)
     *
     * @param scripts ranges sorted by start; entries may be modified in place
     * @return the consolidated ranges; a script index of -1 means UNKNOWN
     */
    private static List<int[]> consolidate(List<int[]> scripts) {
        List<int[]> list = new ArrayList<int[]>();
        int[] last = scripts.get(0);
        list.add(last);
        for (int i = 1; i < scripts.size(); i++) {
            int[] cur = scripts.get(i);
            if (cur[0] != last[1] + 1) {
                if (hasAssigned(last[1] + 1, cur[0])) {
                    // surrogates only?
                    list.add(new int[] { last[1] + 1, cur[0] - 1, -1 }); // unknown
                } else if (last[2] == cur[2]) {
                    //combine
                    last[1] = cur[1];
                    continue;
                } else {
                    // expand last
                    last[1] = cur[0] - 1;
                }
            }
            list.add(cur);
            last = cur;
        }
        return list;
    }

    /** Tells whether any code point in [from, to) is not UNASSIGNED. */
    private static boolean hasAssigned(int from, int to) {
        for (int cp = from; cp < to; cp++) {
            if (Character.getType(cp) != Character.UNASSIGNED) {
                debug("Warning: [%x] is ASSIGNED but in NON script%n", cp);
                return true;
            }
        }
        return false;
    }

    /** Upper-case constant name for a range; "UNKNOWN" when its index is -1. */
    private static String scriptName(int[] a, String[] names) {
        return (a[2] == -1) ? "UNKNOWN" : names[a[2]].toUpperCase(Locale.US);
    }

    /** Prints the generated enum together with its two lookup tables. */
    private static void emit(String[] names, List<int[]> list) {
        //////////////////OUTPUT//////////////////////////////////
        print("public class Scripts {%n%n");
        print(" public static enum UnicodeScript {%n");
        for (int i = 0; i < names.length; i++) {
            print(" /**%n * Unicode script \"%s\".%n */%n", names[i]);
            print(" %s,%n%n", names[i].toUpperCase(Locale.US));
        }
        print(" /**%n * Unicode script \"Unknown\".%n */%n UNKNOWN;%n%n");

        // the tables are closed with an UNKNOWN entry unless the last range
        // already reaches the end of the code space
        int[] last = list.get(list.size() - 1);
        boolean needsTail = last[1] != Character.MAX_CODE_POINT;

        // lookup table
        print(" private static final int[] scriptStarts = {%n");
        for (int[] a : list) {
            String name = scriptName(a, names);
            if (a[0] < 0x10000)
                print(" 0x%04X, // %04X..%04X; %s%n",
                      a[0], a[0], a[1], name);
            else
                print(" 0x%05X, // %05X..%05X; %s%n",
                      a[0], a[0], a[1], name);
        }
        if (needsTail)
            print(" 0x%05X // %05X..%06X; %s%n",
                  last[1] + 1, last[1] + 1, Character.MAX_CODE_POINT,
                  "UNKNOWN");
        print("%n };%n%n");

        print(" private static final UnicodeScript[] scripts = {%n");
        for (int[] a : list) {
            print(" %s,%n", scriptName(a, names));
        }
        if (needsTail)
            print(" UNKNOWN%n");
        print(" };%n");
        print(" }%n");
        print("}%n");
    }
}
|
||||
@ -35,6 +35,8 @@ import java.io.BufferedWriter;
|
||||
import java.io.FileWriter;
|
||||
import java.io.File;
|
||||
|
||||
import build.tools.generatecharacter.CharacterName;
|
||||
|
||||
/**
|
||||
* This program generates the source code for the class java.lang.Character.
|
||||
* It also generates native C code that can perform the same operations.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
106
jdk/src/share/classes/java/lang/CharacterName.java
Normal file
106
jdk/src/share/classes/java/lang/CharacterName.java
Normal file
@ -0,0 +1,106 @@
|
||||
/*
|
||||
* Copyright 2010 Sun Microsystems, Inc. All Rights Reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Sun designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Sun in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
|
||||
* CA 95054 USA or visit www.sun.com if you need additional information or
|
||||
* have any questions.
|
||||
*/
|
||||
|
||||
package java.lang;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.lang.ref.SoftReference;
|
||||
import java.util.Arrays;
|
||||
import java.util.zip.InflaterInputStream;
|
||||
import java.security.AccessController;
|
||||
import java.security.PrivilegedAction;
|
||||
|
||||
/**
 * Lazily loaded table of character names, read from the deflated
 * "uniName.dat" resource located next to this class.
 *
 * <p>The concatenated name bytes are held through a {@link SoftReference}
 * and are reloaded on demand; the per-code-point index is held strongly.
 */
class CharacterName {

    /** Concatenated ASCII names; reloaded when the reference has been cleared. */
    private static SoftReference<byte[]> refStrPool;

    /**
     * Two-level index: {@code lookup[cp >> 8][cp & 0xff]} holds
     * {@code (offset into the name pool << 8) | name length}; 0 or a
     * missing row means that the code point has no entry.
     */
    private static int[][] lookup;

    /**
     * Loads the resource and (re)builds the index and the name pool.
     *
     * @return the name pool
     * @throws InternalError if the resource cannot be read; the original
     *         failure is attached as the cause
     */
    private static synchronized byte[] initNamePool() {
        byte[] strPool = null;
        if (refStrPool != null && (strPool = refStrPool.get()) != null)
            return strPool;
        DataInputStream dis = null;
        try {
            dis = new DataInputStream(new InflaterInputStream(
                AccessController.doPrivileged(new PrivilegedAction<InputStream>()
                {
                    public InputStream run() {
                        return getClass().getResourceAsStream("uniName.dat");
                    }
                })));

            // Built locally and published only after the whole file has been
            // read, so a failed load never leaves a half-filled table behind.
            int[][] table = new int[(Character.MAX_CODE_POINT + 1) >> 8][];
            int total = dis.readInt();
            int cpEnd = dis.readInt();
            byte ba[] = new byte[cpEnd];
            dis.readFully(ba);

            int nameOff = 0;
            int cpOff = 0;
            int cp = 0;
            while (cpOff < cpEnd) {
                int len = ba[cpOff++] & 0xff;
                if (len == 0) {
                    // segment start: length byte, then a 3-byte code point
                    len = ba[cpOff++] & 0xff;
                    // always big-endian
                    cp = ((ba[cpOff++] & 0xff) << 16) |
                         ((ba[cpOff++] & 0xff) <<  8) |
                         ((ba[cpOff++] & 0xff));
                } else {
                    cp++;
                }
                int hi = cp >> 8;
                if (table[hi] == null) {
                    table[hi] = new int[0x100];
                }
                table[hi][cp & 0xff] = (nameOff << 8) | len;
                nameOff += len;
            }
            strPool = new byte[total - cpEnd];
            dis.readFully(strPool);
            lookup = table;
            refStrPool = new SoftReference<byte[]>(strPool);
        } catch (Exception x) {
            InternalError ie = new InternalError(x.getMessage());
            ie.initCause(x);    // keep the root cause for diagnosis
            throw ie;
        } finally {
            try {
                if (dis != null)
                    dis.close();
            } catch (Exception ignored) {
                // nothing useful can be done if closing the resource fails
            }
        }
        return strPool;
    }

    /**
     * Returns the name recorded for the given code point.
     *
     * @param cp the code point; it is used as an array index without
     *           validation
     * @return the name, or {@code null} if there is no entry for {@code cp}
     */
    public static String get(int cp) {
        byte[] strPool = null;
        if (refStrPool == null || (strPool = refStrPool.get()) == null)
            strPool = initNamePool();
        int off = 0;
        if (lookup[cp>>8] == null ||
            (off = lookup[cp>>8][cp&0xff]) == 0)
            return null;
        return new String(strPool, 0, off >>> 8, off & 0xff);  // ASCII
    }
}
|
||||
@ -29,6 +29,7 @@ import java.security.AccessController;
|
||||
import java.security.PrivilegedAction;
|
||||
import java.text.CharacterIterator;
|
||||
import java.text.Normalizer;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
@ -200,8 +201,9 @@ import java.util.Arrays;
|
||||
* <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
|
||||
*
|
||||
* <tr><th> </th></tr>
|
||||
* <tr align="left"><th colspan="2" id="unicode">Classes for Unicode blocks and categories</th></tr>
|
||||
*
|
||||
* <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks and categories</th></tr>
|
||||
* * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
|
||||
* <td headers="matches">A Latin script character (simple <a href="#ubc">script</a>)</td></tr>
|
||||
* <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
|
||||
* <td headers="matches">A character in the Greek block (simple <a href="#ubc">block</a>)</td></tr>
|
||||
* <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
|
||||
@ -527,25 +529,40 @@ import java.util.Arrays;
|
||||
* while not equal, compile into the same pattern, which matches the character
|
||||
* with hexadecimal value <tt>0x2014</tt>.
|
||||
*
|
||||
* <a name="ubc"> <p>Unicode blocks and categories are written with the
|
||||
* <tt>\p</tt> and <tt>\P</tt> constructs as in
|
||||
* Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if the input has the
|
||||
* property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt> does not match if
|
||||
* the input has that property. Blocks are specified with the prefix
|
||||
* <tt>In</tt>, as in <tt>InMongolian</tt>. Categories may be specified with
|
||||
* the optional prefix <tt>Is</tt>: Both <tt>\p{L}</tt> and <tt>\p{IsL}</tt>
|
||||
* denote the category of Unicode letters. Blocks and categories can be used
|
||||
* both inside and outside of a character class.
|
||||
*
|
||||
* <a name="ubc">
|
||||
* <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
|
||||
* <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
|
||||
* the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
|
||||
* does not match if the input has that property.
|
||||
* <p>
|
||||
* Scripts are specified either with the prefix {@code Is}, as in
|
||||
* {@code IsHiragana}, or by using the {@code script} keyword (or its short
|
||||
* form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
|
||||
* <p>
|
||||
* Blocks are specified with the prefix {@code In}, as in
|
||||
* {@code InMongolian}, or by using the keyword {@code block} (or its short
|
||||
* form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
|
||||
* <p>
|
||||
* Categories may be specified with the optional prefix {@code Is}:
|
||||
* Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
|
||||
* letters. Same as scripts and blocks, categories can also be specified
|
||||
* by using the keyword {@code general_category} (or its short form
|
||||
* {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
|
||||
* <p>
|
||||
* Scripts, blocks and categories can be used both inside and outside of a
|
||||
* character class.
|
||||
* <p> The supported categories are those of
|
||||
* <a href="http://www.unicode.org/unicode/standard/standard.html">
|
||||
* <i>The Unicode Standard</i></a> in the version specified by the
|
||||
* {@link java.lang.Character Character} class. The category names are those
|
||||
* defined in the Standard, both normative and informative.
|
||||
* The script names supported by <code>Pattern</code> are the valid script names
|
||||
* accepted and defined by
|
||||
* {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
|
||||
* The block names supported by <code>Pattern</code> are the valid block names
|
||||
* accepted and defined by
|
||||
* {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
|
||||
*
|
||||
* <p>
|
||||
* <a name="jcc"> <p>Categories that behave like the java.lang.Character
|
||||
* boolean is<i>methodname</i> methods (except for the deprecated ones) are
|
||||
* available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
|
||||
@ -2488,12 +2505,34 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
name = new String(temp, i, j-i-1);
|
||||
}
|
||||
|
||||
if (name.startsWith("In")) {
|
||||
node = unicodeBlockPropertyFor(name.substring(2));
|
||||
int i = name.indexOf('=');
|
||||
if (i != -1) {
|
||||
// property construct \p{name=value}
|
||||
String value = name.substring(i + 1);
|
||||
name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
|
||||
if ("sc".equals(name) || "script".equals(name)) {
|
||||
node = unicodeScriptPropertyFor(value);
|
||||
} else if ("blk".equals(name) || "block".equals(name)) {
|
||||
node = unicodeBlockPropertyFor(value);
|
||||
} else if ("gc".equals(name) || "general_category".equals(name)) {
|
||||
node = charPropertyNodeFor(value);
|
||||
} else {
|
||||
throw error("Unknown Unicode property {name=<" + name + ">, "
|
||||
+ "value=<" + value + ">}");
|
||||
}
|
||||
} else {
|
||||
if (name.startsWith("Is"))
|
||||
if (name.startsWith("In")) {
|
||||
// \p{inBlockName}
|
||||
node = unicodeBlockPropertyFor(name.substring(2));
|
||||
} else if (name.startsWith("Is")) {
|
||||
// \p{isGeneralCategory} and \p{isScriptName}
|
||||
name = name.substring(2);
|
||||
node = charPropertyNodeFor(name);
|
||||
node = CharPropertyNames.charPropertyFor(name);
|
||||
if (node == null)
|
||||
node = unicodeScriptPropertyFor(name);
|
||||
} else {
|
||||
node = charPropertyNodeFor(name);
|
||||
}
|
||||
}
|
||||
if (maybeComplement) {
|
||||
if (node instanceof Category || node instanceof Block)
|
||||
@ -2503,6 +2542,21 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
return node;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a CharProperty matching all characters belong to
|
||||
* a UnicodeScript.
|
||||
*/
|
||||
private CharProperty unicodeScriptPropertyFor(String name) {
|
||||
final Character.UnicodeScript script;
|
||||
try {
|
||||
script = Character.UnicodeScript.forName(name);
|
||||
} catch (IllegalArgumentException iae) {
|
||||
throw error("Unknown character script name {" + name + "}");
|
||||
}
|
||||
return new Script(script);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a CharProperty matching all characters in a UnicodeBlock.
|
||||
*/
|
||||
@ -3566,6 +3620,19 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Node class that matches a Unicode script
|
||||
*/
|
||||
static final class Script extends CharProperty {
|
||||
final Character.UnicodeScript script;
|
||||
Script(Character.UnicodeScript script) {
|
||||
this.script = script;
|
||||
}
|
||||
boolean isSatisfiedBy(int ch) {
|
||||
return script == Character.UnicodeScript.of(ch);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Node class that matches a Unicode category.
|
||||
*/
|
||||
|
||||
105
jdk/test/java/lang/Character/CheckScript.java
Normal file
105
jdk/test/java/lang/Character/CheckScript.java
Normal file
@ -0,0 +1,105 @@
|
||||
/**
|
||||
* @test
|
||||
* @bug 6945564
|
||||
* @summary Check that j.l.Character.UnicodeScript.of() agrees with the ranges listed in the script data file
|
||||
* @ignore don't run until #6903266 is integrated
|
||||
*/
|
||||
|
||||
import java.io.*;
|
||||
import java.lang.reflect.*;
|
||||
import java.util.*;
|
||||
import java.util.regex.*;
|
||||
import java.lang.Character.UnicodeScript;
|
||||
|
||||
/**
 * Verifies Character.UnicodeScript against a Scripts.txt style data file:
 * every code point listed in the file must map to the listed script, and
 * every code point that maps to a script other than UNKNOWN must be listed
 * in the file under that script.
 */
public class CheckScript {

    /**
     * Entry point.
     *
     * @param args exactly one element: the path of the script data file
     * @throws RuntimeException on the first mismatch that is found
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.out.println("java CheckScript script.txt");
            System.exit(1);
        }
        Map<String, int[]> scripts = readRanges(args[0]);
        checkDefinedRanges(scripts);
        checkAllCodePoints(scripts);
    }

    /**
     * Parses the data file into a map from the lower-cased script name to a
     * flat array of range bounds: {start0, end0, start1, end1, ...}.
     */
    private static Map<String, int[]> readRanges(String file) throws IOException {
        Matcher m = Pattern.compile("(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s+;\\s+(\\w+)\\s+#.*").matcher("");
        Map<String, List<Integer>> parsed = new LinkedHashMap<>();
        try (BufferedReader sbfr = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = sbfr.readLine()) != null) {
                if (line.length() <= 1 || line.charAt(0) == '#') {
                    continue;
                }
                m.reset(line);
                if (!m.matches()) {
                    continue;
                }
                int start = Integer.parseInt(m.group(1), 16);
                int end = (m.group(2) == null) ? start
                                               : Integer.parseInt(m.group(2), 16);
                String name = m.group(3).toLowerCase(Locale.ENGLISH);
                List<Integer> bounds = parsed.get(name);
                if (bounds == null) {
                    bounds = new ArrayList<>();
                    parsed.put(name, bounds);
                }
                bounds.add(start);
                bounds.add(end);
            }
        }

        // Flatten once so the per-code-point check allocates nothing.
        Map<String, int[]> scripts = new LinkedHashMap<>();
        for (Map.Entry<String, List<Integer>> e : parsed.entrySet()) {
            List<Integer> bounds = e.getValue();
            int[] flat = new int[bounds.size()];
            for (int i = 0; i < flat.length; i++) {
                flat[i] = bounds.get(i);
            }
            scripts.put(e.getKey(), flat);
        }
        return scripts;
    }

    /** check all defined ranges */
    private static void checkDefinedRanges(Map<String, int[]> scripts) {
        for (Map.Entry<String, int[]> e : scripts.entrySet()) {
            String name = e.getKey();
            System.out.println("Checking " + name + "...");
            int[] ranges = e.getValue();
            Character.UnicodeScript expected =
                Character.UnicodeScript.forName(name);
            for (int off = 0; off < ranges.length; off += 2) {
                for (int cp = ranges[off]; cp <= ranges[off + 1]; cp++) {
                    Character.UnicodeScript script =
                        Character.UnicodeScript.of(cp);
                    if (script != expected) {
                        throw new RuntimeException(
                            "UnicodeScript failed: cp=" +
                            Integer.toHexString(cp) +
                            ", of(cp)=<" + script + "> but <" +
                            expected + "> is expected");
                    }
                }
            }
        }
    }

    /** check all codepoints, U+10FFFF included */
    private static void checkAllCodePoints(Map<String, int[]> scripts) {
        for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) {
            Character.UnicodeScript script = Character.UnicodeScript.of(cp);
            if (script == Character.UnicodeScript.UNKNOWN) {
                // UNKNOWN is only acceptable for code points that are
                // unassigned, surrogates or private use
                int type = Character.getType(cp);
                if (type != Character.UNASSIGNED &&
                    type != Character.SURROGATE &&
                    type != Character.PRIVATE_USE) {
                    throw new RuntimeException(
                        "UnicodeScript failed: cp=" +
                        Integer.toHexString(cp) +
                        ", of(cp)=<" + script +
                        "> but the code point is assigned (type=" + type + ")");
                }
            } else {
                int[] ranges =
                    scripts.get(script.name().toLowerCase(Locale.ENGLISH));
                // a script absent from the file is reported like any other
                // code point that lies outside the listed ranges
                if (ranges == null || !contains(ranges, cp)) {
                    throw new RuntimeException(
                        "UnicodeScript failed: cp=" +
                        Integer.toHexString(cp) +
                        ", of(cp)=<" + script +
                        "> but NOT in ranges of this script");
                }
            }
        }
    }

    /** Tells whether cp lies in one of the {start, end} pairs of ranges. */
    private static boolean contains(int[] ranges, int cp) {
        for (int off = 0; off < ranges.length; off += 2) {
            if (cp >= ranges[off] && cp <= ranges[off + 1]) {
                return true;
            }
        }
        return false;
    }
}
|
||||
1972
jdk/test/java/lang/Character/Scripts.txt
Normal file
1972
jdk/test/java/lang/Character/Scripts.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -32,7 +32,7 @@
|
||||
* 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
|
||||
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
|
||||
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
|
||||
* 6350801 6676425 6878475 6919132 6931676
|
||||
* 6350801 6676425 6878475 6919132 6931676 6948903
|
||||
*/
|
||||
|
||||
import java.util.regex.*;
|
||||
@ -135,7 +135,7 @@ public class RegExTest {
|
||||
surrogatesInClassTest();
|
||||
namedGroupCaptureTest();
|
||||
nonBmpClassComplementTest();
|
||||
|
||||
unicodePropertiesTest();
|
||||
if (failure)
|
||||
throw new RuntimeException("Failure in the RE handling.");
|
||||
else
|
||||
@ -3515,7 +3515,7 @@ public class RegExTest {
|
||||
report("NamedGroupCapture");
|
||||
}
|
||||
|
||||
// This is for bug 6919132
|
||||
// This is for bug 6919132
|
||||
private static void nonBmpClassComplementTest() throws Exception {
|
||||
Pattern p = Pattern.compile("\\P{Lu}");
|
||||
Matcher m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
|
||||
@ -3539,4 +3539,79 @@ public class RegExTest {
|
||||
report("NonBmpClassComplement");
|
||||
}
|
||||
|
||||
private static void unicodePropertiesTest() throws Exception {
|
||||
// different forms
|
||||
if (!Pattern.compile("\\p{IsLu}").matcher("A").matches() ||
|
||||
!Pattern.compile("\\p{Lu}").matcher("A").matches() ||
|
||||
!Pattern.compile("\\p{gc=Lu}").matcher("A").matches() ||
|
||||
!Pattern.compile("\\p{general_category=Lu}").matcher("A").matches() ||
|
||||
!Pattern.compile("\\p{IsLatin}").matcher("B").matches() ||
|
||||
!Pattern.compile("\\p{sc=Latin}").matcher("B").matches() ||
|
||||
!Pattern.compile("\\p{script=Latin}").matcher("B").matches() ||
|
||||
!Pattern.compile("\\p{InBasicLatin}").matcher("c").matches() ||
|
||||
!Pattern.compile("\\p{blk=BasicLatin}").matcher("c").matches() ||
|
||||
!Pattern.compile("\\p{block=BasicLatin}").matcher("c").matches())
|
||||
failCount++;
|
||||
|
||||
Matcher common = Pattern.compile("\\p{script=Common}").matcher("");
|
||||
Matcher unknown = Pattern.compile("\\p{IsUnknown}").matcher("");
|
||||
Matcher lastSM = common;
|
||||
Character.UnicodeScript lastScript = Character.UnicodeScript.of(0);
|
||||
|
||||
Matcher latin = Pattern.compile("\\p{block=basic_latin}").matcher("");
|
||||
Matcher greek = Pattern.compile("\\p{InGreek}").matcher("");
|
||||
Matcher lastBM = latin;
|
||||
Character.UnicodeBlock lastBlock = Character.UnicodeBlock.of(0);
|
||||
|
||||
for (int cp = 1; cp < Character.MAX_CODE_POINT; cp++) {
|
||||
if (cp >= 0x30000 && (cp & 0x70) == 0){
|
||||
continue; // only pick couple code points, they are the same
|
||||
}
|
||||
|
||||
// Unicode Script
|
||||
Character.UnicodeScript script = Character.UnicodeScript.of(cp);
|
||||
Matcher m;
|
||||
String str = new String(Character.toChars(cp));
|
||||
if (script == lastScript) {
|
||||
m = lastSM;
|
||||
m.reset(str);
|
||||
} else {
|
||||
m = Pattern.compile("\\p{Is" + script.name() + "}").matcher(str);
|
||||
}
|
||||
if (!m.matches()) {
|
||||
failCount++;
|
||||
}
|
||||
Matcher other = (script == Character.UnicodeScript.COMMON)? unknown : common;
|
||||
other.reset(str);
|
||||
if (other.matches()) {
|
||||
failCount++;
|
||||
}
|
||||
lastSM = m;
|
||||
lastScript = script;
|
||||
|
||||
// Unicode Block
|
||||
Character.UnicodeBlock block = Character.UnicodeBlock.of(cp);
|
||||
if (block == null) {
|
||||
//System.out.printf("Not a Block: cp=%x%n", cp);
|
||||
continue;
|
||||
}
|
||||
if (block == lastBlock) {
|
||||
m = lastBM;
|
||||
m.reset(str);
|
||||
} else {
|
||||
m = Pattern.compile("\\p{block=" + block.toString() + "}").matcher(str);
|
||||
}
|
||||
if (!m.matches()) {
|
||||
failCount++;
|
||||
}
|
||||
other = (block == Character.UnicodeBlock.BASIC_LATIN)? greek : latin;
|
||||
other.reset(str);
|
||||
if (other.matches()) {
|
||||
failCount++;
|
||||
}
|
||||
lastBM = m;
|
||||
lastBlock = block;
|
||||
}
|
||||
report("unicodeProperties");
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user