互联网时代的社会语言学:基于SNS的文本数据挖掘 博客分类: 大数据处理自然语言处理 数据挖掘互联网sns
程序员文章站
2024-02-08 14:33:28
...
互联网时代的社会语言学:基于SNS的文本数据挖掘
本文转载于http://www.matrix67.com/blog/archives/5044
几个概念
凝固度
我们定义“电影院”的凝合程度就是 p(电影院) 与 p(电) · p(影院) 比值和 p(电影院) 与 p(电影) · p(院) 的比值中的较小值,“的电影”的凝合程度则是 p(的电影) 分别除以 p(的) · p(电影) 和 p(的电) · p(影) 所得的商的较小值。
自由度
我们不妨就把一个文本片段的自由运用程度定义为它的左邻字信息熵和右邻字信息熵中的较小值。
java实现,100M文本效果还可以,但大于100M以后内存会溢出
本文转载于http://www.matrix67.com/blog/archives/5044
几个概念
凝固度
我们定义“电影院”的凝合程度就是 p(电影院) 与 p(电) · p(影院) 比值和 p(电影院) 与 p(电影) · p(院) 的比值中的较小值,“的电影”的凝合程度则是 p(的电影) 分别除以 p(的) · p(电影) 和 p(的电) · p(影) 所得的商的较小值。
自由度
我们不妨就把一个文本片段的自由运用程度定义为它的左邻字信息熵和右邻字信息熵中的较小值。
java实现,100M文本效果还可以,但大于100M以后内存会溢出
public class FindWordsByWordArray { private final static ResourceBundle resourceBundle = ResourceBundle.getBundle("finder"); private Map<String, Word> wordsMap = new HashMap<String, Word>(); private int wordMaxLen = 5; private double allTextLen = 0; private double allDomSize = 0; private double mutualInformationPunish = 0.5; private double leftAndRightEntropyPunish = 1; private double wholePunish = 10; public FindWordsByWordArray() { } public FindWordsByWordArray(long num) { this.allDomSize = num; } public static long pretreatment(File input, File output) throws IOException { if(output.exists()) FileUtils.deleteQuietly(output); LineIterator list = FileUtils.lineIterator(input, "utf-8"); List<String> res = new ArrayList<String>(); long num = 0; for (String text = list.next(); list.hasNext(); text = list.next()) { num++; res.addAll(pretreatment(text)); if(res.size() > 500000) { FileUtils.writeLines(output, res, true); System.out.println("write lines 500000."); res.clear(); } } list.close(); if(res.size() > 0) FileUtils.writeLines(output, res, true); System.out.println("pretreatment over."); return num; } private static List<String> pretreatment(String... texts) { List<String> res = new ArrayList<String>(); for(String text: texts) { text = text.toLowerCase().replaceAll("\\d", "N") .replaceAll("(\\p{P}|\\s+|&[a-zA-Z]*;|[a-zA-z]+://[^\\s]*|~|~|★)", "#") .replace('.', '#') .replace('+', '#') .replace('|', '#') .replace('>', '#'); for (String some : text.split("#")) { if (some.length() < 5) continue; res.add(some); } } return res; } public void parse(boolean needPretreatment, String... 
texts) { if(needPretreatment) { allDomSize += texts.length; parse(false, pretreatment(texts)); return; } for (String text : texts) { if (text.matches("^[a-zA-Z]*")) { parseEnglish(text); allTextLen += 1; }else { parseChinese(text); allTextLen += text.length(); } } } private void parseEnglish(String text) { addEnglishWord(text); } private void parseChinese(String text) { WordArray wordArray = new WordArray(text); String left = null; int thisWordMaxLen = wordMaxLen; for (int index = 0, textLen = wordArray.wordLen(); index < textLen - 1; index++) { for (int i = 2; i <= thisWordMaxLen; i++) { int toIndex = index + i; if (toIndex > textLen) break; String word = wordArray.subWords(index, toIndex); addWord(word); if (left != null) wordsMap.get(word).leftAdd(left); if (toIndex + 1 <= textLen) wordsMap.get(word).rightAdd(wordArray.subWords(toIndex, toIndex + 1)); } left = wordArray.subWords(index, index + 1); } for (String s : wordArray.getChineseWords()) { addWord(s); } for (String s : wordArray.getEnglishWords()) { addEnglishWord(s); } } private void addWord(String word) { if (word.length() == 0) throw new IllegalArgumentException("word length is 0."); if (wordsMap.containsKey(word)) wordsMap.get(word).getTf().incrementAndGet(); else wordsMap.put(word, new Word(word)); } private void addEnglishWord(String word) { addWord(word); wordsMap.get(word).setAllEnglish(true); } public void parse(boolean needPretreatment, Collection<String> texts) { parse(needPretreatment, texts.toArray(new String[texts.size()])); } public List<String> print() { return print(getRes()); } public List<String> print(List<Word> words) { List<String> res = new ArrayList<String>(); for (Word word : words) { res.add(word.toTab()); } return res; } public List<Word> getRes() { List<Word> words = new ArrayList<Word>(wordsMap.values()); words = Lists.newArrayList(Collections2.filter(words, new Predicate<Word>() { @Override public boolean apply(Word word) { return word.getConfidenceLevel() > 1; } })); 
Collections.sort(words, new Comparator<Word>() { @Override public int compare(Word word1, Word word2) { return word2.tf.get() - word1.tf.get(); } }); return words; } class Word { private String word; private AtomicInteger tf; private StringBuilder left; private StringBuilder right; private Double level = null; private boolean isAllEnglish = false; Word(String word) { this.word = word; this.tf = new AtomicInteger(1); } public String getWord() { return word; } public AtomicInteger getTf() { return tf; } public void leftAdd(String str) { if(left == null) this.left = new StringBuilder(3); if(this.left.indexOf(str) < 0) this.left.append(str); } public int getLeftNum() { if(left == null) return 0; return new WordArray(left.toString()).wordLen(); } public void rightAdd(String str) { if(right == null) this.right = new StringBuilder(3); if(this.right.indexOf(str) < 0) this.right.append(str); } public int getRightNum() { if(right == null) return 0; return new WordArray(right.toString()).wordLen(); } public void setAllEnglish(boolean allEnglish) { isAllEnglish = allEnglish; } private Double getConfidenceLevel() { if (this.level != null) return this.level; double allDomSize = FindWordsByWordArray.this.allDomSize; if (this.getWord().replaceAll("N","").length() <= 1) return 0d; if (this.getTf().get() < allDomSize / 90) return 0d; double value; if (!this.isAllEnglish) { if (this.getLeftNum() < allDomSize / 190) return 0d; if (this.getRightNum() < allDomSize / 190) return 0d; if ((this.getRightNum() + this.getLeftNum()) < allDomSize / 90) return 0d; value = Double.MAX_VALUE; WordArray wordArray = new WordArray(this.getWord()); for (int i = 1; i < wordArray.wordLen(); i++) { int leftTf = wordsMap.get(wordArray.subWords(0, i)).getTf().get(); int rightTf = wordsMap.get(wordArray.subWords(i)).getTf().get(); double normal = leftTf * rightTf / (allTextLen * allTextLen); double reality = this.getTf().get() * 2 / allTextLen; value = reality / normal < value ? 
reality / normal : value; } int size = this.getLeftNum() > this.getRightNum() ? this.getRightNum() : this.getLeftNum(); value = Math.pow(value, mutualInformationPunish) * Math.pow(size, leftAndRightEntropyPunish) / wholePunish; } else { value = this.getTf().get() * 15 / allDomSize; } this.level = value; return value; } @Override public String toString() { return "Word{" + "word='" + word + '\'' + ", tf=" + tf + ", left=" + cutOff(left.toString(), 15) + ", right=" + cutOff(right.toString(), 15) + '}'; } public String toTab() { return word + '\t' + tf + '\t' + level + '\t' + getLeftNum() + '\t' + getRightNum(); } private String cutOff(String str, int max) { if (str.length() > max) str = str.substring(0, max) + "...]"; return "(" + new WordArray(str).wordLen() + ")" + str; } } public void setWordMaxLen(int wordMaxLen) { this.wordMaxLen = wordMaxLen; } public void setMutualInformationPunish(double mutualInformationPunish) { this.mutualInformationPunish = mutualInformationPunish; } public void setLeftAndRightEntropyPunish(double leftAndRightEntropyPunish) { this.leftAndRightEntropyPunish = leftAndRightEntropyPunish; } public void setWholePunish(double wholePunish) { this.wholePunish = wholePunish; } public static void main(String[] args) throws IOException { String inputPath = "e:/xiaoshuo.txt"; String outputPath = "e:/xiaoshuo_words"; // String inputPath = "e:/tweet/parse"; // String outputPath = "e:/tweet/words"; File inputFile = new File(inputPath); if (inputFile.isFile()) { File pretreatFile = new File("e:/xiaoshuo_p"); long domSize = pretreatment(new File(inputPath), pretreatFile); System.out.println(domSize); FindWordsByWordArray findWords = getFindWords(domSize); LineIterator list = FileUtils.lineIterator(pretreatFile, "utf-8"); int i = 0; for(String str = list.next(); list.hasNext(); str = list.next()){ findWords.parse(false, str); if(i++ % 500000 == 0) System.out.print("."); } list.close(); FileUtils.writeLines(new File(outputPath), findWords.print()); } else { 
for (String inputFileName : inputFile.list()) { FindWordsByWordArray findWords = getFindWords(); List<String> list = FileUtils.readLines(new File(inputPath, inputFileName), "utf-8"); findWords.parse(true, Lists.transform(list, new Function<String, String>() { @Override public String apply(String s) { return s.substring(s.split("\t")[0].length()); } })); String outputFileName = inputFileName + "-words."; if (inputFileName.split("\\.").length == 2) outputFileName = inputFileName.split("\\.")[0] + "-words." + inputFileName.split("\\.")[1]; List<String> printList = findWords.print(); if (printList.size() > 500) printList = printList.subList(0, 500); FileUtils.writeLines(new File(outputPath, outputFileName), printList); } } } private static FindWordsByWordArray getFindWords() { return getFindWords(0); } private static FindWordsByWordArray getFindWords(long num) { FindWordsByWordArray findWords = new FindWordsByWordArray(num); findWords.setWordMaxLen(Integer.parseInt(resourceBundle.getString("word.max.len"))); findWords.setMutualInformationPunish( Double.parseDouble(resourceBundle.getString("mutual.information.punish"))); findWords.setLeftAndRightEntropyPunish( Double.parseDouble(resourceBundle.getString("left.and.right.entropy.punish"))); findWords.setWholePunish(Double.parseDouble(resourceBundle.getString("whole.punish"))); return findWords; }
/**
 * Views a string as a sequence of tokens in which every maximal run of characters accepted by
 * {@code CharUtils.isEnglish} counts as ONE token and every other character counts as one token
 * each. All indices taken and returned by the public methods are token indices.
 */
public class WordArray {
    private String someWord;

    /** One {@code {startCharIndex, charLength}} pair per English run; null when there is none. */
    private List<int[]> enIndexAndLen = null;

    /**
     * @param someWord the text to tokenize
     */
    public WordArray(String someWord) {
        this.someWord = someWord;
        char[] chars = someWord.toCharArray();
        int total = chars.length;
        int pos = 0;
        while (pos < total) {
            if (!CharUtils.isEnglish(chars[pos])) {
                pos++;
                continue;
            }
            int runStart = pos;
            do {
                pos++;
            } while (pos < total && CharUtils.isEnglish(chars[pos]));
            if (enIndexAndLen == null) {
                enIndexAndLen = new ArrayList<int[]>();
            }
            enIndexAndLen.add(new int[]{runStart, pos - runStart});
            // The character that ended the run is already known not to be English; step over it.
            pos++;
        }
    }

    /**
     * @param beginIndex first token, inclusive
     * @param endIndex   last token, exclusive
     * @return the text covered by tokens {@code [beginIndex, endIndex)}
     */
    public String subWords(int beginIndex, int endIndex) {
        int from = beginIndex;
        int to = endIndex;
        if (enIndexAndLen != null) {
            for (int k = 0; k < enIndexAndLen.size(); k++) {
                int[] run = enIndexAndLen.get(k);
                // Each run lying before a boundary widens that boundary by the run's extra chars.
                int extraChars = run[1] - 1;
                if (run[0] < from) {
                    from += extraChars;
                }
                if (run[0] < to) {
                    to += extraChars;
                }
            }
        }
        return someWord.substring(from, to);
    }

    /**
     * @param beginIndex first token, inclusive
     * @return the text from token {@code beginIndex} to the end
     */
    public String subWords(int beginIndex) {
        return subWords(beginIndex, wordLen());
    }

    /**
     * @return the number of tokens
     */
    public int wordLen() {
        int tokens = someWord.length();
        if (enIndexAndLen == null) {
            return tokens;
        }
        for (int k = 0; k < enIndexAndLen.size(); k++) {
            tokens -= enIndexAndLen.get(k)[1] - 1;
        }
        return tokens;
    }

    /**
     * @return the English runs in order of appearance; an empty array when there is none
     */
    public String[] getEnglishWords() {
        if (enIndexAndLen == null) {
            return new String[0];
        }
        String[] runs = new String[enIndexAndLen.size()];
        for (int k = 0; k < runs.length; k++) {
            int[] run = enIndexAndLen.get(k);
            runs[k] = someWord.substring(run[0], run[0] + run[1]);
        }
        return runs;
    }

    /**
     * @return every character not accepted by {@code CharUtils.isEnglish}, each as its own
     *         string, in order of appearance
     */
    public List<String> getChineseWords() {
        List<String> singles = new ArrayList<String>();
        char[] chars = someWord.toCharArray();
        for (int k = 0; k < chars.length; k++) {
            if (!CharUtils.isEnglish(chars[k])) {
                singles.add(String.valueOf(chars[k]));
            }
        }
        return singles;
    }

    /** Small manual demo of the token-index arithmetic. */
    public static void main(String[] args) {
        WordArray demo = new WordArray("我爱Style江南的music哈");
        int tokenCount = demo.wordLen();
        System.out.println(demo.subWords(0, 5));
        System.out.println(demo.subWords(5, tokenCount));
        System.out.println(tokenCount);
        System.out.println(Arrays.toString(demo.getEnglishWords()));
    }
}
#(int)[2-n default=10] word max len.
word.max.len = 5
#(double)[1.0-0.0 default=0.3] MutualInformation punish.
mutual.information.punish = 0.5
#(double)[1.0-0.0 default=1.0] LeftAndRightEntropy punish.
left.and.right.entropy.punish = 1
#(double)[1-n default=10] WholePunish punish.
whole.punish = 10