【自然语言实战】·第二章(1.1)——获取词语首字字母
程序员文章站
2024-02-28 11:49:40
...
一、maven依赖
<dependency>
<groupId>net.sourceforge.pinyin4j</groupId>
<artifactId>pinyin4j</artifactId>
<version>2.5.0</version>
</dependency>
二、示例代码
import com.pingan.lcloud.ark.log.LoggerUtil;
import net.sourceforge.pinyin4j.PinyinHelper;
import org.apache.commons.lang3.CharUtils;
import org.apache.commons.lang3.StringUtils;
import java.lang.annotation.Native;
import java.util.Objects;
/**
 * Utilities for Chinese text: extracting the upper-case pinyin initial of the
 * first character of a string, and stripping punctuation.
 *
 * <p><code>Details determine success.</code>
 *
 * <p>Created 2019-11-06 15:57 by Liang ZC.
 *
 * @author LIANGZHICHENG035
 * @see <a href="http://www.stanford.edu">http://www.stanford.edu</a>
 */
public class ChineseUtils {

    /** Regular expression matching one Unicode punctuation character (general category P). */
    private static final String PUNCTUATION = "\\pP";

    /** Lower bound (U+4E00) of the character range treated as Chinese. */
    private static final char CHINESE_RANGE_START = '\u4E00';

    /** Upper bound (U+9FA5) of the character range treated as Chinese. */
    private static final char CHINESE_RANGE_END = '\u9FA5';

    /**
     * Creates an instance. Every method of this class is static, so an instance
     * is never required; the constructor stays public only so that existing
     * callers keep compiling.
     */
    public ChineseUtils() {
    }

    /**
     * Returns the initial of {@code chinese} as computed by
     * {@link #getChineseInitial(String)}, or {@code defaultValue} when that
     * initial is empty.
     *
     * <pre>
     * ChineseUtils.getChineseInitialDefaultIfEmpty("我爱中国", "#") = "W"
     * ChineseUtils.getChineseInitialDefaultIfEmpty("1爱中国", "#")  = "1"
     * ChineseUtils.getChineseInitialDefaultIfEmpty(null, "#")      = "#"
     * ChineseUtils.getChineseInitialDefaultIfEmpty("", "#")        = "#"
     * ChineseUtils.getChineseInitialDefaultIfEmpty(",", "#")       = "#"
     * </pre>
     *
     * @param chinese      the text whose first character is inspected; may be {@code null}
     * @param defaultValue the value returned when no initial can be derived
     * @return the upper-case initial, or {@code defaultValue} if the initial is empty
     */
    public static String getChineseInitialDefaultIfEmpty(String chinese, String defaultValue) {
        String initial = getChineseInitial(chinese, true);
        return StringUtils.isEmpty(initial) ? defaultValue : initial;
    }

    /**
     * Returns the initial of the first character of {@code chinese} after
     * punctuation has been removed. Equivalent to
     * {@code getChineseInitial(chinese, true)}.
     *
     * <pre>
     * ChineseUtils.getChineseInitial("我爱中国")  = "W"
     * ChineseUtils.getChineseInitial("爱中国")    = "A"
     * ChineseUtils.getChineseInitial("1爱中国")   = "1"
     * ChineseUtils.getChineseInitial("中国")      = "Z"
     * ChineseUtils.getChineseInitial("@#国")      = "G"
     * ChineseUtils.getChineseInitial("国%$")      = "G"
     * ChineseUtils.getChineseInitial("国")        = "G"
     * ChineseUtils.getChineseInitial("W我爱中国") = "W"
     * ChineseUtils.getChineseInitial("I我爱中国") = "I"
     * ChineseUtils.getChineseInitial("null")      = "N"
     * ChineseUtils.getChineseInitial(null)        = ""
     * ChineseUtils.getChineseInitial("")          = ""
     * ChineseUtils.getChineseInitial(",")         = ""
     * </pre>
     *
     * @param chinese the text whose first character is inspected; may be {@code null}
     * @return the upper-case initial, or {@link StringUtils#EMPTY} when none can be derived
     */
    public static String getChineseInitial(String chinese) {
        return getChineseInitial(chinese, true);
    }

    /**
     * Returns the upper-case initial of the first character of {@code chinese}.
     *
     * <ul>
     *   <li>An ASCII letter or digit is returned as is (letters upper-cased).</li>
     *   <li>A character in the range U+4E00..U+9FA5 is converted to pinyin and the
     *       first letter of its first reading is returned; for a polyphonic
     *       character only the first reading is used.</li>
     *   <li>Anything else, as well as {@code null} or blank input, yields
     *       {@link StringUtils#EMPTY}.</li>
     * </ul>
     *
     * <pre>
     * ChineseUtils.getChineseInitial("我爱中国", true)  = "W"
     * ChineseUtils.getChineseInitial("1爱中国", true)   = "1"
     * ChineseUtils.getChineseInitial("@#国", true)      = "G"
     * ChineseUtils.getChineseInitial("国%$", true)      = "G"
     * ChineseUtils.getChineseInitial(", 我", true)      = "W"
     * ChineseUtils.getChineseInitial("@#国", false)     = ""
     * ChineseUtils.getChineseInitial(null, true)        = ""
     * ChineseUtils.getChineseInitial("", true)          = ""
     * ChineseUtils.getChineseInitial(",", true)         = ""
     * </pre>
     *
     * @param chinese           the text whose first character is inspected; may be {@code null}
     * @param removePunctuation whether to strip punctuation (and trim) before
     *                          looking at the first character
     * @return the upper-case initial, or {@link StringUtils#EMPTY} when none can be derived
     */
    public static String getChineseInitial(String chinese, boolean removePunctuation) {
        String text = removePunctuation ? removePunctuation(chinese) : chinese;
        if (StringUtils.isBlank(text)) {
            return StringUtils.EMPTY;
        }

        char firstChar = text.charAt(0);

        // ASCII letters and digits are their own initial. Character.toUpperCase is
        // locale-independent, unlike String.toUpperCase(), which would turn "i"
        // into a dotted capital I under a Turkish default locale.
        if (CharUtils.isAsciiAlphanumeric(firstChar)) {
            return String.valueOf(Character.toUpperCase(firstChar));
        }

        // Only characters inside the Chinese range are looked up in pinyin4j.
        if (firstChar < CHINESE_RANGE_START || firstChar > CHINESE_RANGE_END) {
            return StringUtils.EMPTY;
        }

        try {
            String[] readings = PinyinHelper.toHanyuPinyinStringArray(firstChar);
            // A character without a pinyin reading is an expected outcome, not an
            // error: answer with the empty string instead of relying on an
            // exception from dereferencing a missing result.
            if (readings == null || readings.length == 0 || StringUtils.isEmpty(readings[0])) {
                return StringUtils.EMPTY;
            }
            return String.valueOf(Character.toUpperCase(readings[0].charAt(0)));
        } catch (Exception e) {
            // Best effort: an unexpected lookup failure is logged and reported as "no initial".
            LoggerUtil.warn("get " + text + " chinese initial fail.", e);
            return StringUtils.EMPTY;
        }
    }

    /**
     * Removes every Unicode punctuation character from {@code str} and trims the
     * result. Punctuation is removed before trimming so that whitespace which
     * only becomes leading or trailing once the punctuation is gone is trimmed
     * as well.
     *
     * <pre>
     * ChineseUtils.removePunctuation(null)          = ""
     * ChineseUtils.removePunctuation("")            = ""
     * ChineseUtils.removePunctuation(" ")           = ""
     * ChineseUtils.removePunctuation("我爱中国")     = "我爱中国"
     * ChineseUtils.removePunctuation("我爱中国!")    = "我爱中国"
     * ChineseUtils.removePunctuation("我爱中国。")   = "我爱中国"
     * ChineseUtils.removePunctuation("我爱中国.")    = "我爱中国"
     * ChineseUtils.removePunctuation(" 我爱中国. ")  = "我爱中国"
     * ChineseUtils.removePunctuation(", 我爱中国")   = "我爱中国"
     * </pre>
     *
     * @param str the text to clean; may be {@code null}
     * @return the text without punctuation and without leading or trailing
     *         whitespace, or {@link StringUtils#EMPTY} if {@code str} is {@code null} or empty
     */
    public static String removePunctuation(String str) {
        if (StringUtils.isEmpty(str)) {
            return StringUtils.EMPTY;
        }
        return str.replaceAll(PUNCTUATION, StringUtils.EMPTY).trim();
    }
}
三、运行结果
/**
 * Demo entry point: prints the initial returned by
 * {@code ChineseUtils.getChineseInitial} for each sample input, one per line.
 * The samples cover Chinese text, leading digits and Latin letters, surrounding
 * punctuation, and the degenerate inputs {@code null}, the empty string and a
 * lone comma.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    String[] samples = {
        "我爱中国",
        "爱中国",
        "1爱中国",
        "中国",
        "@#国",
        "国%$",
        "国",
        "W我爱中国",
        "I我爱中国",
        "null",
        null,
        "",
        ","
    };
    for (String sample : samples) {
        System.out.println(ChineseUtils.getChineseInitial(sample));
    }
}
W
A
1
Z
G
G
G
W
I
N
上一篇: kerberos安装配置与使用