java判断中文字符
程序员文章站
2022-03-24 19:58:16
...
//代码来自HanLP自然语言处理库,git地址:https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/utility/TextUtility.java
/**
* 文本工具类
*/
public class TextUtility
{
/**
 * Type code for a single-byte character; also the base value the other
 * {@code CT_*} codes are derived from.
 */
public static final int CT_SINGLE = 5;// single byte
/**
 * Type code for a delimiter such as "!,.?()[]{}+=
 */
public static final int CT_DELIMITER = CT_SINGLE + 1;// delimiter
/**
 * Type code for a Chinese character.
 */
public static final int CT_CHINESE = CT_SINGLE + 2;// Chinese character
/**
 * Type code for a letter.
 */
public static final int CT_LETTER = CT_SINGLE + 3;// letter
/**
 * Type code for a digit.
 */
public static final int CT_NUM = CT_SINGLE + 4;// digit
/**
 * Type code for an index (ordinal) symbol.
 */
public static final int CT_INDEX = CT_SINGLE + 5;// index / ordinal symbol
/**
 * Type code for a Chinese numeral.
 */
public static final int CT_CNUM = CT_SINGLE + 6;
/**
 * Type code for anything not covered by the codes above.
 */
public static final int CT_OTHER = CT_SINGLE + 12;// other
/**
 * Classifies a single character by converting it to a one-character string and
 * delegating to {@link #charType(String)}.
 *
 * @param c the character to classify
 * @return the type code produced by {@link #charType(String)}
 */
public static int charType(char c)
{
    final String asText = Character.toString(c);
    return charType(asText);
}
/**
 * Classifies a string by inspecting the first one or two bytes of its GBK encoding.
 * <p>
 * A string that occurs inside the Chinese-numeral table is reported as {@code CT_CNUM}
 * before any byte inspection. Otherwise, with {@code ub1}/{@code ub2} being the unsigned
 * first and second byte ({@code ub2} is 0 when there is only one byte):
 * <ul>
 * <li>{@code ub1 < 32}: {@code CT_DELIMITER} (control characters, line feed included)</li>
 * <li>space: {@code CT_OTHER}</li>
 * <li>ASCII punctuation from the delimiter table: {@code CT_DELIMITER}</li>
 * <li>ASCII digit: {@code CT_NUM}</li>
 * <li>any other value below 128: {@code CT_SINGLE}</li>
 * <li>{@code ub1 == 162}: {@code CT_INDEX}</li>
 * <li>{@code ub1 == 163} with {@code ub2} in 176..185: {@code CT_NUM}</li>
 * <li>{@code ub1 == 163} with {@code ub2} in 193..218 or 225..250: {@code CT_LETTER}</li>
 * <li>{@code ub1} equal to 161 or 163 otherwise: {@code CT_DELIMITER}</li>
 * <li>{@code ub1} in 176..247: {@code CT_CHINESE}</li>
 * <li>everything else: {@code CT_OTHER}</li>
 * </ul>
 *
 * @param str the text to classify; {@code null} or empty yields {@code CT_OTHER}
 * @return one of the {@code CT_*} type codes
 */
public static int charType(String str)
{
    if (str == null || str.length() == 0)
    {
        return CT_OTHER;
    }
    if ("零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟".contains(str))
    {
        return CT_CNUM;
    }
    byte[] b;
    try
    {
        b = str.getBytes("GBK");
    }
    catch (UnsupportedEncodingException e)
    {
        // GBK is unavailable in this runtime: fall back to the platform default charset.
        b = str.getBytes();
        e.printStackTrace();
    }
    final byte b1 = b[0];
    final byte b2 = b.length > 1 ? b[1] : 0;
    final int ub1 = getUnsigned(b1);
    final int ub2 = getUnsigned(b2);
    if (ub1 < 128)
    {
        if (ub1 < 32)
        {
            // Control characters. This already covers '\n' (10), so no separate
            // line-feed test is needed afterwards.
            return CT_DELIMITER;
        }
        if (' ' == b1)
        {
            return CT_OTHER;
        }
        if ("*\"!,.?()[]{}+=/\\;:|".indexOf((char) b1) != -1)
        {
            return CT_DELIMITER;
        }
        if ("0123456789".indexOf((char) b1) != -1)
        {
            return CT_NUM;
        }
        return CT_SINGLE;
    }
    if (ub1 == 162)
    {
        return CT_INDEX;
    }
    if (ub1 == 163 && ub2 > 175 && ub2 < 186)
    {
        return CT_NUM;
    }
    if (ub1 == 163 && ((ub2 >= 193 && ub2 <= 218) || (ub2 >= 225 && ub2 <= 250)))
    {
        return CT_LETTER;
    }
    if (ub1 == 161 || ub1 == 163)
    {
        return CT_DELIMITER;
    }
    if (ub1 >= 176 && ub1 <= 247)
    {
        return CT_CHINESE;
    }
    return CT_OTHER;
}
/**
 * Tells whether a string is non-empty and consists solely of characters in the
 * range U+4E00 to U+9FA5 (inclusive).
 * <p>
 * Implemented as a plain range scan so that no regular expression has to be
 * compiled on every call.
 *
 * @param str the text to test; must not be {@code null}
 * @return {@code true} if every character lies in the range and the string is not empty
 * @throws NullPointerException if {@code str} is {@code null}
 */
public static boolean isAllChinese(String str)
{
    final int length = str.length();
    if (length == 0)
    {
        return false;
    }
    for (int i = 0; i < length; i++)
    {
        final char c = str.charAt(i);
        if (c < '\u4E00' || c > '\u9FA5')
        {
            return false;
        }
    }
    return true;
}
/**
 * Scans a byte sequence and reports whether no inspected lead byte has an
 * unsigned value in 176..247.
 * <p>
 * The scan advances by two bytes after a negative (high-bit) byte and by one
 * byte otherwise, so the second byte of such a pair is never inspected.
 *
 * @param sString the bytes to scan
 * @return {@code false} as soon as an inspected byte falls in 176..247, otherwise {@code true}
 */
public static boolean isAllNonChinese(byte[] sString)
{
    final int total = sString.length;
    for (int pos = 0; pos < total; )
    {
        final byte current = sString[pos];
        final int lead = getUnsigned(current);
        if (lead > 175 && lead < 248)
        {
            return false;
        }
        pos += (current < 0) ? 2 : 1;
    }
    return true;
}
/**
 * Tells whether no character of the string has a code unit above 128.
 * Note that the value 128 itself is accepted.
 *
 * @param str the text to test; asserted to be non-null
 * @return {@code true} if every character is at most 128 (also for the empty string)
 */
public static boolean isAllSingleByte(String str)
{
    assert str != null;
    for (char c : str.toCharArray())
    {
        if (c > 128)
        {
            return false;
        }
    }
    return true;
}
/**
 * Parses a decimal integer, using -1 as the "not a number" sentinel.
 * <p>
 * Because -1 is also a legal parse result, callers cannot distinguish the input
 * {@code "-1"} from an invalid input.
 *
 * @param str the text to parse; may be {@code null}
 * @return the parsed value, or -1 if {@code str} is {@code null} or not a valid integer
 */
public static int cint(String str)
{
    if (str == null)
    {
        return -1;
    }
    try
    {
        return Integer.parseInt(str);
    }
    catch (NumberFormatException ignored)
    {
        // Malformed input is an expected case here and is reported through the sentinel.
        return -1;
    }
}
/**
 * Tells whether a string looks like a number: an optional sign, a run of digits,
 * optionally one separator followed by more digits, and optionally one trailing
 * unit/percent character.
 * <p>
 * The string is scanned twice in sequence: first for full-width digits
 * (U+FF10 to U+FF19) with their separators, then for ASCII digits with theirs.
 * The full-width characters are written as Unicode escapes so that tools which
 * fold full-width forms to ASCII cannot silently turn the first pass into a
 * duplicate of the second.
 *
 * @param str the text to test; may be {@code null}
 * @return {@code true} if the whole string is consumed by the pattern above;
 *         {@code false} for {@code null} or the empty string
 */
public static boolean isAllNum(String str)
{
    if (str == null || str.length() == 0)
    {
        return false;
    }
    final String fullWidthDigits = "\uFF10\uFF11\uFF12\uFF13\uFF14\uFF15\uFF16\uFF17\uFF18\uFF19";
    final String halfWidthDigits = "0123456789";
    final int len = str.length();
    int i = 0;
    // Optional leading sign: plus-minus, ASCII +/-, full-width +/-, em dash.
    if ("±+-\uFF0B\uFF0D—".indexOf(str.charAt(0)) != -1)
    {
        i++;
    }
    // Pass 1: full-width digits.
    while (i < len && fullWidthDigits.indexOf(str.charAt(i)) != -1)
    {
        i++;
    }
    // One middle separator (e.g. the dot in 98.1%), then more full-width digits.
    if (i > 0 && i < len && "·∶:\uFF0C,\uFF0E.\uFF0F/".indexOf(str.charAt(i)) != -1)
    {
        i++;
        while (i < len && fullWidthDigits.indexOf(str.charAt(i)) != -1)
        {
            i++;
        }
    }
    if (i >= len)
    {
        return true;
    }
    // Pass 2: ASCII digits.
    while (i < len && halfWidthDigits.indexOf(str.charAt(i)) != -1)
    {
        i++;
    }
    // One middle separator, then more ASCII digits.
    if (i > 0 && i < len && ",./:∶·\uFF0C\uFF0E\uFF0F".indexOf(str.charAt(i)) != -1)
    {
        i++;
        while (i < len && halfWidthDigits.indexOf(str.charAt(i)) != -1)
        {
            i++;
        }
    }
    // Optional single trailing magnitude or percent/per-mille character.
    if (i < len && "百千万亿佰仟%\uFF05‰".indexOf(str.charAt(i)) != -1)
    {
        i++;
    }
    return i >= len;
}
/**
 * Tells whether a byte sequence consists of two-byte units whose lead byte is
 * 162 (unsigned), optionally followed by ASCII letters only.
 * <p>
 * The letter test is evaluated only while the index is in range. The bounds
 * check must guard both the upper-case and the lower-case comparison, otherwise
 * the element one past the end is read for any input that ends in letters.
 *
 * @param sString the bytes to test
 * @return {@code true} if the whole array matches the pattern (also for an empty array)
 */
public static boolean isAllIndex(byte[] sString)
{
    final int nLen = sString.length;
    int i = 0;
    // Leading two-byte units with lead byte 162 (the byte is widened to its unsigned value).
    while (i < nLen - 1 && (sString[i] & 0xFF) == 162)
    {
        i += 2;
    }
    // Whatever is left must be ASCII letters.
    while (i < nLen)
    {
        final byte b = sString[i];
        final boolean letter = (b >= 'A' && b <= 'Z') || (b >= 'a' && b <= 'z');
        if (!letter)
        {
            return false;
        }
        i++;
    }
    return true;
}
/**
 * Tells whether every character of the text is an ASCII letter (a-z or A-Z).
 *
 * @param text the text to test; must not be {@code null}
 * @return {@code true} if no other character occurs (also for the empty string)
 */
public static boolean isAllLetter(String text)
{
    for (char c : text.toCharArray())
    {
        final boolean lower = c >= 'a' && c <= 'z';
        final boolean upper = c >= 'A' && c <= 'Z';
        if (!lower && !upper)
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether every character of the text is an ASCII letter (a-z, A-Z) or an
 * ASCII digit (0-9).
 *
 * @param text the text to test; must not be {@code null}
 * @return {@code true} if no other character occurs (also for the empty string)
 */
public static boolean isAllLetterOrNum(String text)
{
    for (char c : text.toCharArray())
    {
        final boolean lower = c >= 'a' && c <= 'z';
        final boolean upper = c >= 'A' && c <= 'Z';
        final boolean digit = c >= '0' && c <= '9';
        if (!(lower || upper || digit))
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether a byte sequence is made up entirely of two-byte units whose
 * lead byte is 161 or 163 (unsigned).
 * <p>
 * Only lead bytes are inspected; the scan steps two bytes at a time and stops
 * at the first lead byte that is neither value, or when fewer than two bytes remain.
 *
 * @param sString the bytes to test
 * @return {@code true} if the scan consumed the whole array (also for an empty array)
 */
public static boolean isAllDelimiter(byte[] sString)
{
    final int total = sString.length;
    int pos = 0;
    while (pos < total - 1)
    {
        final int lead = getUnsigned(sString[pos]);
        if (lead != 161 && lead != 163)
        {
            break;
        }
        pos += 2;
    }
    return pos >= total;
}
/**
 * Tells whether a word is written entirely with Chinese numeral characters.
 * <p>
 * Rules applied while walking the word left to right:
 * <ul>
 * <li>the two-character sequence meaning "fraction of" is skipped wherever it occurs;</li>
 * <li>the first character may be one of the prefix characters;</li>
 * <li>the last character may be one of the suffix characters, but only if no
 *     prefix was accepted;</li>
 * <li>every other character must belong to the numeral table.</li>
 * </ul>
 *
 * @param word the word to test; may be {@code null}
 * @return {@code true} if all characters pass (also for the empty string);
 *         {@code false} for {@code null}
 */
public static boolean isAllChineseNum(String word)
{
    if (word == null)
    {
        return false;
    }
    final String numerals = "零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·./点";
    final String leading = "几数上第";
    final String trailing = "几多余来成倍";
    final int total = word.length();
    boolean affixSeen = false;
    int pos = 0;
    while (pos < total)
    {
        if (word.startsWith("分之", pos))
        {
            // Step over both characters of the fraction marker.
            pos += 2;
            continue;
        }
        final char ch = word.charAt(pos);
        final boolean asPrefix = pos == 0 && leading.indexOf(ch) != -1;
        final boolean asSuffix = !asPrefix && pos == total - 1 && !affixSeen && trailing.indexOf(ch) != -1;
        if (asPrefix || asSuffix)
        {
            affixSeen = true;
        }
        else if (numerals.indexOf(ch) == -1)
        {
            return false;
        }
        pos++;
    }
    return true;
}
/**
 * Counts how many characters of {@code word} occur in {@code charSet}.
 * Each position of {@code word} is counted separately, so repeated characters
 * count repeatedly.
 *
 * @param charSet the set of accepted characters, given as a string
 * @param word    the text whose characters are checked; may be {@code null}
 * @return the number of positions in {@code word} whose character is in
 *         {@code charSet}; 0 if {@code word} is {@code null}
 */
public static int getCharCount(String charSet, String word)
{
    if (word == null)
    {
        return 0;
    }
    int hits = 0;
    for (int pos = 0, total = word.length(); pos < total; pos++)
    {
        if (charSet.indexOf(word.charAt(pos)) >= 0)
        {
            hits++;
        }
    }
    return hits;
}
/**
 * Returns the value of a byte interpreted as an unsigned 8-bit number (0..255).
 * <p>
 * Written as a single mask. The earlier form {@code b & 0x7F + 128} produced the
 * same values only because {@code +} binds tighter than {@code &}, making it
 * {@code b & 0xFF}.
 *
 * @param b the byte to convert
 * @return the unsigned value of {@code b}
 */
public static int getUnsigned(byte b)
{
    return b & 0xFF;
}
/**
 * Heuristically decides whether a string denotes a year.
 * <p>
 * The string is accepted if any of the following holds:
 * <ul>
 * <li>it is all single-byte and has length 4, or length 2 with a first character
 *     that parses to a digit above 4 or to 0;</li>
 * <li>it is numeric and has length 3 or more, or length 2 starting with one of 0, 5-9;</li>
 * <li>it has length 2 or more and consists only of Chinese digit characters;</li>
 * <li>it has length 4 and contains exactly two "thousand"/"zero" characters;</li>
 * <li>it is a single "thousand" character;</li>
 * <li>it has length 2 with exactly one heavenly-stem character overall and an
 *     earthly-branch character in second position.</li>
 * </ul>
 *
 * @param snum the text to test; may be {@code null}
 * @return {@code true} if one of the rules matches; {@code false} otherwise,
 *         including for {@code null} and the empty string
 */
public static boolean isYearTime(String snum)
{
    // An empty string has no first character to inspect and cannot be a year.
    if (snum == null || snum.length() == 0)
    {
        return false;
    }
    final int len = snum.length();
    final String first = snum.substring(0, 1);
    // e.g. 1992, 98, 06
    if (isAllSingleByte(snum)
        && (len == 4 || (len == 2 && (cint(first) > 4 || cint(first) == 0))))
    {
        return true;
    }
    if (isAllNum(snum) && (len >= 3 || (len == 2 && "056789".indexOf(first) != -1)))
    {
        return true;
    }
    if (getCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖", snum) == len && len >= 2)
    {
        return true;
    }
    // e.g. a four-character "two thousand and two"
    if (len == 4 && getCharCount("千仟零○", snum) == 2)
    {
        return true;
    }
    if (len == 1 && getCharCount("千仟", snum) == 1)
    {
        return true;
    }
    return len == 2 && getCharCount("甲乙丙丁戊己庚辛壬癸", snum) == 1
        && getCharCount("子丑寅卯辰巳午未申酉戌亥", snum.substring(1)) == 1;
}
/**
 * Tells whether every character of {@code str} occurs in {@code aggr}.
 * <p>
 * Only the characters of {@code str} itself are checked; no sentinel character
 * is appended, so the result does not depend on whether {@code aggr} happens to
 * contain some unrelated character.
 *
 * @param aggr the set of accepted characters, given as a string; may be {@code null}
 * @param str  the text to check; may be {@code null}
 * @return {@code true} if all characters of {@code str} are in {@code aggr}
 *         (also for the empty string); {@code false} if either argument is {@code null}
 */
public static boolean isInAggregate(String aggr, String str)
{
    if (aggr == null || str == null)
    {
        return false;
    }
    for (int i = 0; i < str.length(); i++)
    {
        if (aggr.indexOf(str.charAt(i)) == -1)
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether every character of the string is half-width, i.e. encodes to
 * exactly one byte in GBK.
 * <p>
 * If GBK is not supported by the runtime, the stack trace is printed and the
 * platform default charset is used for that character instead.
 *
 * @param str the text to test; may be {@code null}
 * @return {@code true} if each character encodes to one byte (also for the empty
 *         string); {@code false} for {@code null}
 */
public static boolean isDBCCase(String str)
{
    if (str == null)
    {
        return false;
    }
    for (char c : str.toCharArray())
    {
        final String single = String.valueOf(c);
        int width;
        try
        {
            width = single.getBytes("GBK").length;
        }
        catch (UnsupportedEncodingException e)
        {
            e.printStackTrace();
            width = single.getBytes().length;
        }
        if (width != 1)
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether every character of the string is full-width, i.e. encodes to
 * exactly two bytes in GBK.
 * <p>
 * Only the characters of {@code str} itself are measured. Appending a one-byte
 * padding character and measuring it too would make the result {@code false}
 * for every input.
 * <p>
 * If GBK is not supported by the runtime, the stack trace is printed and the
 * platform default charset is used for that character instead.
 *
 * @param str the text to test; may be {@code null}
 * @return {@code true} if each character encodes to two bytes (also for the
 *         empty string); {@code false} for {@code null}
 */
public static boolean isSBCCase(String str)
{
    if (str == null)
    {
        return false;
    }
    for (int i = 0; i < str.length(); i++)
    {
        final String s = str.substring(i, i + 1);
        int length;
        try
        {
            length = s.getBytes("GBK").length;
        }
        catch (UnsupportedEncodingException e)
        {
            e.printStackTrace();
            length = s.getBytes().length;
        }
        if (length != 2)
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether the string is exactly one hyphen character used as a connector:
 * either the ASCII hyphen-minus or the full-width hyphen-minus (U+FF0D).
 * <p>
 * The full-width form is written as a Unicode escape so that it stays distinct
 * from the ASCII form even if the source passes through width-folding tools.
 *
 * @param str the text to test; may be {@code null}
 * @return {@code true} if {@code str} equals one of the two hyphen forms
 */
public static boolean isDelimiter(String str)
{
    return "-".equals(str) || "\uFF0D".equals(str);
}
/**
 * Tells whether a word carries the unknown-word tag prefix, i.e. begins with
 * the marker character followed by two hash signs.
 *
 * @param word the word to test; may be {@code null}
 * @return {@code true} if {@code word} is non-null and starts with the marker
 */
public static boolean isUnknownWord(String word)
{
    return word != null && word.startsWith("未##");
}
/**
 * Guards against division by zero by substituting a small positive value for a
 * zero frequency.
 *
 * @param frequency the frequency to guard
 * @return {@code 1e-3} if {@code frequency} compares equal to 0, otherwise
 *         {@code frequency} unchanged
 */
public static double nonZero(double frequency)
{
    return (frequency == 0) ? 1e-3 : frequency;
}
/**
 * Splits a {@code long} into four 16-bit {@code char} values, most significant
 * 16 bits first.
 *
 * @param x the value to split
 * @return a new four-element array holding bits 63..48, 47..32, 31..16 and 15..0 of {@code x}
 */
public static char[] long2char(long x)
{
    final char[] units = new char[4];
    for (int idx = 0; idx < units.length; idx++)
    {
        // Shift amounts are 48, 32, 16 and 0 for indices 0..3.
        final int shift = 16 * (units.length - 1 - idx);
        units[idx] = (char) (x >> shift);
    }
    return units;
}
/**
 * Converts a {@code long} into a four-character string by way of
 * {@link #long2char(long)}.
 *
 * @param x the value to convert
 * @return a string made of the characters returned by {@code long2char(x)}, in order
 */
public static String long2String(long x)
{
    return String.valueOf(long2char(x));
}
/**
 * Renders an exception's stack trace into a string.
 *
 * @param e the exception to render; must not be {@code null}
 * @return the text that {@code e.printStackTrace} writes
 */
public static String exceptionToString(Exception e)
{
    final StringWriter buffer = new StringWriter();
    final PrintWriter sink = new PrintWriter(buffer);
    e.printStackTrace(sink);
    return buffer.toString();
}
/**
 * Tells whether a character lies in the range U+4E00 to U+9FA5 (inclusive).
 * <p>
 * Implemented as a direct range comparison so that no string is allocated and
 * no regular expression is compiled per character.
 *
 * @param c the character to test
 * @return {@code true} if {@code c} is within the range, otherwise {@code false}
 */
public static boolean isChinese(char c)
{
    return c >= '\u4e00' && c <= '\u9fa5';
}
/**
 * Counts the non-overlapping occurrences of {@code keyword} in {@code srcText},
 * scanning left to right and resuming right after each match.
 *
 * @param keyword the text to look for
 * @param srcText the text to search in
 * @return the number of non-overlapping matches; 0 if either argument is
 *         {@code null} or {@code keyword} is empty
 */
public static int count(String keyword, String srcText)
{
    // An empty or missing keyword has no meaningful occurrences.
    if (keyword == null || srcText == null || keyword.length() == 0)
    {
        return 0;
    }
    int occurrences = 0;
    int from = srcText.indexOf(keyword);
    while (from != -1)
    {
        occurrences++;
        from = srcText.indexOf(keyword, from + keyword.length());
    }
    return occurrences;
}
/**
 * Writes a string to a data stream as its length (one {@code int}) followed by
 * each of its characters (one {@code char} each).
 *
 * @param s   the string to write; must not be {@code null}
 * @param out the stream to write to
 * @throws IOException if the underlying stream fails
 */
public static void writeString(String s, DataOutputStream out) throws IOException
{
    final int total = s.length();
    out.writeInt(total);
    for (int pos = 0; pos < total; pos++)
    {
        out.writeChar(s.charAt(pos));
    }
}
/**
 * Tells whether a character sequence is {@code null}, empty, or made up only of
 * characters for which {@link Character#isWhitespace(char)} is true.
 *
 * @param cs the sequence to test; may be {@code null}
 * @return {@code true} if there is no non-whitespace character
 */
public static boolean isBlank(CharSequence cs)
{
    if (cs != null)
    {
        final int total = cs.length();
        for (int pos = 0; pos < total; pos++)
        {
            if (!Character.isWhitespace(cs.charAt(pos)))
            {
                return false;
            }
        }
    }
    return true;
}
上一篇: 基于Node开发的KoaHub的静态服务器重写和索引代码
下一篇: 了解Java程序之面向对象