互联网时代的社会语言学：基于SNS的文本数据挖掘

程序员文章站 2022-07-14 23:27:45

...

互联网时代的社会语言学：基于SNS的文本数据挖掘
本文转载于http://www.matrix67.com/blog/archives/5044

几个概念

凝固度
我们定义“电影院”的凝合程度就是 p(电影院) 与 p(电) · p(影院) 比值和 p(电影院) 与 p(电影) · p(院) 的比值中的较小值，“的电影”的凝合程度则是 p(的电影) 分别除以 p(的) · p(电影) 和 p(的电) · p(影) 所得的商的较小值。

自由度
我们不妨就把一个文本片段的自由运用程度定义为它的左邻字信息熵和右邻字信息熵中的较小值。

Java 实现：处理 100M 以内的文本效果还可以，但文本超过 100M 时会出现内存溢出。

/**
 * Counts text fragments (up to {@code wordMaxLen} units long) together with their
 * left/right neighbours, then scores each fragment to decide whether it looks like a word.
 */
public class FindWordsByWordArray {  
  
    // Tuning values are read from the "finder" resource bundle by getFindWords(long).
    private final static ResourceBundle resourceBundle = ResourceBundle.getBundle("finder");  
  
    // Candidate fragment text -> its accumulated statistics.
    private Map<String, Word> wordsMap = new HashMap<String, Word>();  
  
    // Maximum number of units in a candidate fragment built by parseChinese.
    private int wordMaxLen = 5;  
    // Running total of text length accumulated by parse (1 per all-letter text, else text.length()).
    private double allTextLen = 0;  
    // Number of input texts/lines; scales the frequency thresholds in Word.getConfidenceLevel.
    private double allDomSize = 0;  
    // Exponent applied to the cohesion ratio when scoring.
    private double mutualInformationPunish = 0.5;  
    // Exponent applied to the smaller of the left/right neighbour counts when scoring.
    private double leftAndRightEntropyPunish = 1;  
    // Divisor applied to the combined score.
    private double wholePunish = 10;  
  
    /** Creates a finder whose fields keep their declared defaults (allDomSize starts at 0). */
    public FindWordsByWordArray() {  
    }  
  
    /**
     * Creates a finder with a preset document count.
     *
     * @param num initial value for allDomSize
     */
    public FindWordsByWordArray(long num) {  
        this.allDomSize = num;  
    }  
  
    /**
     * Normalizes every line of {@code input} and appends the resulting text segments to
     * {@code output}, one segment per line. An existing {@code output} file is deleted first.
     *
     * <p>The previous loop called {@code next()} before {@code hasNext()} and tested
     * {@code hasNext()} after fetching each line, so the last line was never processed and an
     * empty file raised {@code NoSuchElementException}. The loop now checks first, and the
     * iterator is closed even when reading or writing fails.
     *
     * @param input  UTF-8 text file to read
     * @param output file that receives the normalized segments
     * @return the number of lines read from {@code input}
     * @throws IOException if reading or writing fails
     */
    public static long pretreatment(File input, File output) throws IOException {

        if (output.exists())
            FileUtils.deleteQuietly(output);

        LineIterator list = FileUtils.lineIterator(input, "utf-8");

        List<String> res = new ArrayList<String>();

        long num = 0;
        try {
            while (list.hasNext()) {
                String text = list.next();

                num++;

                res.addAll(pretreatment(text));

                // Flush in batches so the buffer does not grow with the input size.
                if (res.size() > 500000) {
                    FileUtils.writeLines(output, res, true);
                    System.out.println("write lines 500000.");
                    res.clear();
                }
            }
        } finally {
            list.close();
        }

        if (res.size() > 0)
            FileUtils.writeLines(output, res, true);

        System.out.println("pretreatment over.");
        return num;
    }
  
    /**
     * Normalizes raw texts into segments suitable for counting.
     *
     * <p>Each text is lower-cased, every digit becomes {@code N}, and punctuation, whitespace
     * runs, HTML entities, URLs and a few extra symbols become the separator {@code #}. The
     * text is then split on {@code #}; segments shorter than 5 characters are dropped.
     *
     * <p>The URL scheme class was written {@code [a-zA-z]}, a range that also covers
     * {@code [ \ ] ^ _} and the backtick; it is now {@code [a-zA-Z]}.
     *
     * @param texts raw input texts
     * @return the retained segments, in input order
     */
    private static List<String> pretreatment(String... texts) {
        List<String> res = new ArrayList<String>();

        for (String text : texts) {
            text = text.toLowerCase().replaceAll("\\d", "N")
                    .replaceAll("(\\p{P}|\\s+|&[a-zA-Z]*;|[a-zA-Z]+://[^\\s]*|~|～|★)", "#")
                    .replace('.', '#')
                    .replace('+', '#')
                    .replace('|', '#')
                    .replace('>', '#');

            for (String some : text.split("#")) {
                if (some.length() < 5)
                    continue;
                res.add(some);
            }
        }
        return res;
    }
  
    /**
     * Feeds texts into the statistics.
     *
     * <p>With {@code needPretreatment} set, the texts are counted as documents, normalized,
     * and the resulting segments are parsed. Otherwise each text is parsed directly: a text
     * made only of ASCII letters is recorded as one English word, anything else is broken
     * into fragments by {@code parseChinese}.
     *
     * <p>The letter test used {@code *}, which also matched the empty string and sent it to
     * {@code addWord}, where it raised {@code IllegalArgumentException}. It now requires at
     * least one letter, so an empty text falls through to {@code parseChinese} and adds nothing.
     *
     * @param needPretreatment whether {@code texts} are raw and must be normalized first
     * @param texts            texts to parse
     */
    public void parse(boolean needPretreatment, String... texts) {
        if (needPretreatment) {
            allDomSize += texts.length;
            parse(false, pretreatment(texts));
            return;
        }
        for (String text : texts) {
            if (text.matches("[a-zA-Z]+")) {
                parseEnglish(text);
                allTextLen += 1;
            } else {
                parseChinese(text);
                allTextLen += text.length();
            }
        }
    }
  
    /** Records a text that consists only of letters as a single English word. */
    private void parseEnglish(String text) {  
        addEnglishWord(text);  
    }  
  
    /**
     * Registers every fragment of 2..wordMaxLen consecutive units of {@code text}, together
     * with the unit directly before it and the unit directly after it, and then registers
     * each single unit on its own.
     *
     * @param text one normalized text segment
     */
    private void parseChinese(String text) {
        WordArray units = new WordArray(text);
        int unitCount = units.wordLen();
        String previousUnit = null;

        for (int start = 0; start < unitCount - 1; start++) {
            for (int size = 2; size <= wordMaxLen; size++) {
                int end = start + size;
                if (end > unitCount) {
                    break;
                }
                String candidate = units.subWords(start, end);
                addWord(candidate);
                Word entry = wordsMap.get(candidate);
                if (previousUnit != null) {
                    entry.leftAdd(previousUnit);
                }
                // A right neighbour exists only when the fragment stops before the last unit.
                if (end < unitCount) {
                    entry.rightAdd(units.subWords(end, end + 1));
                }
            }
            previousUnit = units.subWords(start, start + 1);
        }

        for (String single : units.getChineseWords()) {
            addWord(single);
        }
        for (String english : units.getEnglishWords()) {
            addEnglishWord(english);
        }
    }
  
    /**
     * Counts one occurrence of {@code word}: creates its entry on first sight, otherwise
     * increments the entry's frequency.
     *
     * @param word fragment to count
     * @throws IllegalArgumentException if {@code word} is empty
     */
    private void addWord(String word) {
        if (word.length() == 0) {
            throw new IllegalArgumentException("word length is 0.");
        }
        Word existing = wordsMap.get(word);
        if (existing == null) {
            wordsMap.put(word, new Word(word));
        } else {
            existing.getTf().incrementAndGet();
        }
    }
  
    /**
     * Counts one occurrence of {@code word} and flags its entry as all-English.
     *
     * @param word English word to count
     */
    private void addEnglishWord(String word) {
        addWord(word);
        Word entry = wordsMap.get(word);
        entry.setAllEnglish(true);
    }
  
    /**
     * Collection overload: copies {@code texts} into an array and delegates to the
     * varargs {@code parse}.
     *
     * @param needPretreatment whether the texts must be normalized first
     * @param texts            texts to parse
     */
    public void parse(boolean needPretreatment, Collection<String> texts) {
        String[] asArray = new String[texts.size()];
        asArray = texts.toArray(asArray);
        parse(needPretreatment, asArray);
    }
  
    /** Returns the tab-separated lines for the words selected by {@code getRes()}. */
    public List<String> print() {  
        return print(getRes());  
    }  
  
    /**
     * Renders each word as one tab-separated line.
     *
     * @param words words to render
     * @return one line per word, in the same order
     */
    public List<String> print(List<Word> words) {
        List<String> lines = new ArrayList<String>();
        for (int i = 0; i < words.size(); i++) {
            lines.add(words.get(i).toTab());
        }
        return lines;
    }
  
    /**
     * Selects the words whose confidence level is greater than 1 and orders them by
     * descending frequency.
     *
     * @return the selected words, most frequent first
     */
    public List<Word> getRes() {
        List<Word> kept = new ArrayList<Word>();
        for (Word candidate : wordsMap.values()) {
            if (candidate.getConfidenceLevel() > 1) {
                kept.add(candidate);
            }
        }
        Comparator<Word> byFrequencyDescending = new Comparator<Word>() {
            @Override
            public int compare(Word first, Word second) {
                return second.tf.get() - first.tf.get();
            }
        };
        Collections.sort(kept, byFrequencyDescending);
        return kept;
    }
  
    /**
     * Statistics for one candidate fragment: its frequency, the distinct units seen
     * directly to its left and right, and its lazily computed confidence level.
     *
     * <p>Two defects are fixed here. The expected-frequency product {@code leftTf * rightTf}
     * was an int multiplication and could overflow before the division; it is now computed
     * in double. {@code toString()} threw NullPointerException for a word without recorded
     * left or right neighbours; a missing side is now rendered as empty.
     */
    class Word {

        private String word;
        private AtomicInteger tf;
        // Concatenation of the distinct left neighbours; stays null until the first one is added.
        private StringBuilder left;
        // Concatenation of the distinct right neighbours; stays null until the first one is added.
        private StringBuilder right;
        // Cached confidence level; only set once a non-rejected score has been computed.
        private Double level = null;
        private boolean isAllEnglish = false;

        /** Creates the entry with a frequency of 1. */
        Word(String word) {
            this.word = word;
            this.tf = new AtomicInteger(1);
        }

        public String getWord() {
            return word;
        }

        public AtomicInteger getTf() {
            return tf;
        }

        /** Records {@code str} as a left neighbour unless the buffer already contains it. */
        public void leftAdd(String str) {
            if (left == null)
                this.left = new StringBuilder(3);
            if (this.left.indexOf(str) < 0)
                this.left.append(str);
        }

        /** Returns the number of units in the left-neighbour buffer, or 0 when none were added. */
        public int getLeftNum() {
            if (left == null)
                return 0;
            return new WordArray(left.toString()).wordLen();
        }

        /** Records {@code str} as a right neighbour unless the buffer already contains it. */
        public void rightAdd(String str) {
            if (right == null)
                this.right = new StringBuilder(3);
            if (this.right.indexOf(str) < 0)
                this.right.append(str);
        }

        /** Returns the number of units in the right-neighbour buffer, or 0 when none were added. */
        public int getRightNum() {
            if (right == null)
                return 0;
            return new WordArray(right.toString()).wordLen();
        }

        public void setAllEnglish(boolean allEnglish) {
            isAllEnglish = allEnglish;
        }

        /**
         * Scores this fragment. Returns 0 when the fragment is too short, too rare, or has
         * too few distinct neighbours relative to the document count. Otherwise an English
         * word is scored from its frequency alone, and any other fragment from its cohesion
         * ratio combined with the smaller of its two neighbour counts.
         *
         * @return the confidence level; callers keep words whose level is greater than 1
         */
        private Double getConfidenceLevel() {
            if (this.level != null)
                return this.level;

            double allDomSize = FindWordsByWordArray.this.allDomSize;

            // Ignore the digit placeholder when judging the fragment's length.
            if (this.getWord().replaceAll("N", "").length() <= 1)
                return 0d;
            if (this.getTf().get() < allDomSize / 90)
                return 0d;
            double value;
            if (!this.isAllEnglish) {

                if (this.getLeftNum() < allDomSize / 190)
                    return 0d;
                if (this.getRightNum() < allDomSize / 190)
                    return 0d;
                if ((this.getRightNum() + this.getLeftNum()) < allDomSize / 90)
                    return 0d;
                value = Double.MAX_VALUE;

                WordArray wordArray = new WordArray(this.getWord());

                // Try every split point and keep the weakest observed/expected ratio.
                for (int i = 1; i < wordArray.wordLen(); i++) {

                    int leftTf = wordsMap.get(wordArray.subWords(0, i)).getTf().get();

                    int rightTf = wordsMap.get(wordArray.subWords(i)).getTf().get();

                    // Multiply in double: the int product overflows for frequent parts.
                    double normal = (double) leftTf * rightTf / (allTextLen * allTextLen);

                    double reality = this.getTf().get() * 2.0 / allTextLen;

                    value = reality / normal < value ? reality / normal : value;
                }

                int size = this.getLeftNum() > this.getRightNum() ?
                        this.getRightNum() : this.getLeftNum();

                value = Math.pow(value, mutualInformationPunish) *
                        Math.pow(size, leftAndRightEntropyPunish)
                        / wholePunish;
            } else {
                value = this.getTf().get() * 15.0 / allDomSize;
            }
            this.level = value;
            return value;
        }

        @Override
        public String toString() {
            // A side with no recorded neighbours has a null buffer; render it as empty.
            String leftText = left == null ? "" : left.toString();
            String rightText = right == null ? "" : right.toString();
            return "Word{" +
                    "word='" + word + '\'' +
                    ", tf=" + tf +
                    ", left=" + cutOff(leftText, 15) +
                    ", right=" + cutOff(rightText, 15) +
                    '}';
        }

        /** Returns word, frequency, cached level, left count and right count, tab-separated. */
        public String toTab() {
            return word + '\t' +
                    tf + '\t' +
                    level + '\t' +
                    getLeftNum() + '\t' +
                    getRightNum();
        }

        /** Truncates {@code str} to {@code max} characters and prefixes its unit count. */
        private String cutOff(String str, int max) {
            if (str.length() > max)
                str = str.substring(0, max) + "...]";
            return "(" + new WordArray(str).wordLen() + ")" + str;
        }
    }
  
    /** Sets the maximum number of units in a candidate fragment. */
    public void setWordMaxLen(int wordMaxLen) {  
        this.wordMaxLen = wordMaxLen;  
    }  
  
    /** Sets the exponent applied to the cohesion ratio when scoring. */
    public void setMutualInformationPunish(double mutualInformationPunish) {  
        this.mutualInformationPunish = mutualInformationPunish;  
    }  
  
    /** Sets the exponent applied to the smaller neighbour count when scoring. */
    public void setLeftAndRightEntropyPunish(double leftAndRightEntropyPunish) {  
        this.leftAndRightEntropyPunish = leftAndRightEntropyPunish;  
    }  
  
    /** Sets the divisor applied to the combined score. */
    public void setWholePunish(double wholePunish) {  
        this.wholePunish = wholePunish;  
    }  
  
    /**
     * Command-line entry point working on hard-coded paths.
     *
     * <p>If the input path is a file, it is normalized into an intermediate file, parsed
     * line by line, and all selected words are written to the output path. Otherwise the
     * input path is treated as a directory: each file in it is parsed on its own (the text
     * before the first tab of every line is dropped) and at most 500 words per file are written.
     *
     * <p>The line loop used to skip the last line and fail on an empty file; it now checks
     * {@code hasNext()} before reading and closes the iterator in a {@code finally} block.
     * A path that is neither a file nor a listable directory now raises an
     * {@code IOException} instead of a NullPointerException.
     *
     * @param args unused
     * @throws IOException if reading or writing fails, or the input path cannot be listed
     */
    public static void main(String[] args) throws IOException {

        String inputPath = "e:/xiaoshuo.txt";
        String outputPath = "e:/xiaoshuo_words";

//        String inputPath = "e:/tweet/parse";
//        String outputPath = "e:/tweet/words";

        File inputFile = new File(inputPath);

        if (inputFile.isFile()) {
            File pretreatFile = new File("e:/xiaoshuo_p");
            long domSize = pretreatment(new File(inputPath), pretreatFile);
            System.out.println(domSize);
            FindWordsByWordArray findWords = getFindWords(domSize);
            LineIterator list = FileUtils.lineIterator(pretreatFile, "utf-8");
            int i = 0;
            try {
                while (list.hasNext()) {
                    String str = list.next();
                    findWords.parse(false, str);
                    // Progress marker every 500000 lines.
                    if (i++ % 500000 == 0)
                        System.out.print(".");
                }
            } finally {
                list.close();
            }
            FileUtils.writeLines(new File(outputPath), findWords.print());
        } else {
            String[] inputFileNames = inputFile.list();
            if (inputFileNames == null)
                throw new IOException("input path is neither a file nor a readable directory: "
                        + inputPath);
            for (String inputFileName : inputFileNames) {
                FindWordsByWordArray findWords = getFindWords();
                List<String> list = FileUtils.readLines(new File(inputPath, inputFileName), "utf-8");
                findWords.parse(true, Lists.transform(list, new Function<String, String>() {
                    @Override
                    public String apply(String s) {
                        // Drop the leading column (everything before the first tab).
                        return s.substring(s.split("\t")[0].length());
                    }
                }));
                String outputFileName = inputFileName + "-words.";
                if (inputFileName.split("\\.").length == 2)
                    outputFileName = inputFileName.split("\\.")[0] + "-words." +
                            inputFileName.split("\\.")[1];
                List<String> printList = findWords.print();
                if (printList.size() > 500)
                    printList = printList.subList(0, 500);
                FileUtils.writeLines(new File(outputPath, outputFileName), printList);
            }
        }
    }
  
    /** Builds a configured finder whose document count starts at 0. */
    private static FindWordsByWordArray getFindWords() {  
        return getFindWords(0);  
    }  
  
    /**
     * Builds a finder with the given document count and applies the tuning values stored
     * in the resource bundle.
     *
     * @param num initial document count
     * @return the configured finder
     */
    private static FindWordsByWordArray getFindWords(long num) {
        FindWordsByWordArray finder = new FindWordsByWordArray(num);

        int maxLen = Integer.parseInt(resourceBundle.getString("word.max.len"));
        finder.setWordMaxLen(maxLen);

        double mutualInformation =
                Double.parseDouble(resourceBundle.getString("mutual.information.punish"));
        finder.setMutualInformationPunish(mutualInformation);

        double neighbourEntropy =
                Double.parseDouble(resourceBundle.getString("left.and.right.entropy.punish"));
        finder.setLeftAndRightEntropyPunish(neighbourEntropy);

        double whole = Double.parseDouble(resourceBundle.getString("whole.punish"));
        finder.setWholePunish(whole);

        return finder;
    }

/**
 * A view of a string as a sequence of "words", where every maximal run of English
 * characters counts as one word and every other character counts as one word by itself.
 */
public class WordArray {

    private String someWord;
    // One {charIndex, length} pair per English run, in text order; null when there is none.
    private List<int[]> enIndexAndLen = null;

    /**
     * Scans {@code someWord} once and records where its English runs start and how long
     * they are.
     */
    public WordArray(String someWord) {
        this.someWord = someWord;
        char[] chars = someWord.toCharArray();
        int pos = 0;
        while (pos < chars.length) {
            if (!CharUtils.isEnglish(chars[pos])) {
                pos++;
                continue;
            }
            int runStart = pos;
            do {
                pos++;
            } while (pos < chars.length && CharUtils.isEnglish(chars[pos]));
            if (enIndexAndLen == null) {
                enIndexAndLen = new ArrayList<int[]>();
            }
            enIndexAndLen.add(new int[]{runStart, pos - runStart});
            // The character that ended the run is already known not to be English.
            pos++;
        }
    }

    /**
     * Returns the text covering words {@code beginIndex} (inclusive) to {@code endIndex}
     * (exclusive).
     */
    public String subWords(int beginIndex, int endIndex) {
        return someWord.substring(toCharOffset(beginIndex), toCharOffset(endIndex));
    }

    /** Returns the text from word {@code beginIndex} to the end. */
    public String subWords(int beginIndex) {
        return subWords(beginIndex, wordLen());
    }

    /** Returns the number of words: the character count with each English run counted once. */
    public int wordLen() {
        int count = someWord.length();
        if (enIndexAndLen == null) {
            return count;
        }
        for (int[] run : enIndexAndLen) {
            count = count - run[1] + 1;
        }
        return count;
    }

    /** Returns the English runs in text order; an empty array when there are none. */
    public String[] getEnglishWords() {
        if (enIndexAndLen == null) {
            return new String[0];
        }
        String[] runs = new String[enIndexAndLen.size()];
        for (int k = 0; k < runs.length; k++) {
            int[] run = enIndexAndLen.get(k);
            runs[k] = someWord.substring(run[0], run[0] + run[1]);
        }
        return runs;
    }

    /** Returns every non-English character as its own string, in text order. */
    public List<String> getChineseWords() {
        List<String> singles = new ArrayList<String>();
        char[] chars = someWord.toCharArray();
        for (int k = 0; k < chars.length; k++) {
            if (!CharUtils.isEnglish(chars[k])) {
                singles.add(String.valueOf(chars[k]));
            }
        }
        return singles;
    }

    /** Small demo printing a few views of a mixed Chinese/English string. */
    public static void main(String[] args) {
        WordArray demo = new WordArray("我爱Style江南的music哈");
        int total = demo.wordLen();
        System.out.println(demo.subWords(0, 5));
        System.out.println(demo.subWords(5, total));
        System.out.println(total);
        System.out.println(Arrays.toString(demo.getEnglishWords()));
    }

    /**
     * Converts a word index into a character offset by widening it for every English run
     * that starts before the offset reached so far.
     */
    private int toCharOffset(int wordIndex) {
        int offset = wordIndex;
        if (enIndexAndLen != null) {
            for (int[] run : enIndexAndLen) {
                if (run[0] < offset) {
                    offset += run[1] - 1;
                }
            }
        }
        return offset;
    }

}

#(int)[2-n default=10] word max len.  
word.max.len = 5  
  
#(double)[1.0-0.0 default=0.3] MutualInformation punish.  
mutual.information.punish = 0.5  
  
#(double)[1.0-0.0 default=1.0] LeftAndRightEntropy punish.  
left.and.right.entropy.punish = 1  
  
#(double)[1-n default=10] WholePunish punish.  
whole.punish = 10

相关标签：数据挖掘 互联网 SNS

上一篇：全球互联网已进入SNS时代

下一篇：简洁版输入智能提示框