欢迎您访问程序员文章站,本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  科技

大数据技术挖掘新词实现方法

程序员文章站 2022-06-24 20:16:46
大数据技术挖掘新词实现方法。 代码参考:https://github.com/yanghanxy/New-Word-Detection import codecs import re...

大数据技术挖掘新词实现方法。

代码参考:https://github.com/yanghanxy/New-Word-Detection

import codecs
import re
import pandas as pd
import math

class WordInfo(object):
    """Per-candidate statistics: count, frequency, PMI and neighbour entropy.

    ``nub`` is a raw occurrence counter and ``left_word`` / ``right_word``
    collect the neighbours recorded through :meth:`lupdate` / :meth:`rupdate`.
    ``freq`` is derived by :meth:`compute_freq`; ``pmi`` and both entropies
    are derived by :meth:`update`.
    """

    def __init__(self, text):
        self.text = text

        # Neighbours seen on each side, one entry per recorded occurrence.
        self.left_word = []
        self.right_word = []

        # Raw counter and the scores derived from it.
        self.nub = 0
        self.freq = 0.0
        self.pmi = 0.0
        self.left_entropy = 0.0
        self.right_entropy = 0.0

    def lupdate(self, word):
        """Record ``word`` as one observed left neighbour."""
        self.left_word.append(word)

    def rupdate(self, word):
        """Record ``word`` as one observed right neighbour."""
        self.right_word.append(word)

    def update(self, word_cad):
        """Recompute PMI (multi-character texts only) and both entropies.

        ``word_cad`` maps candidate text to its ``WordInfo``; it is only
        consulted when ``self.text`` has more than one character.
        """
        is_multi_char = len(self.text) > 1
        if is_multi_char:
            self.computer_pmi(word_cad)
        self.compute_indexes()

    def compute_freq(self, length):
        """Set ``freq`` to ``nub`` divided by ``length`` (as a float)."""
        self.freq = 1.0 * self.nub / length

    def computer_pmi(self, word_cad):
        """Set ``pmi`` to the minimum log-ratio over every two-way split.

        For each split ``text = head + tail`` the score is
        ``log(freq / freq(head) / freq(tail))``. A text with fewer than two
        characters has no split, and ``pmi`` is left untouched.
        """
        text = self.text
        scores = []
        for cut in range(1, len(text)):
            head, tail = text[:cut], text[cut:]
            scores.append(
                math.log(self.freq / word_cad[head].freq / word_cad[tail].freq))
        if scores:
            self.pmi = min(scores)

    def compute_entropy(self, _list):
        """Return the Shannon entropy (natural log) of the items in ``_list``.

        An empty list yields ``0``.
        """
        total = float(len(_list))
        if total == 0:
            return 0

        # Tally how many times each distinct item occurs.
        counts = {}
        for item in _list:
            if item in counts:
                counts[item] += 1
            else:
                counts[item] = 1

        return sum(-c / total * math.log(c / total) for c in counts.values())

    def compute_indexes(self):
        """Refresh the left and right neighbour entropies."""
        self.right_entropy = self.compute_entropy(self.right_word)
        self.left_entropy = self.compute_entropy(self.left_word)



class Seg(object):
    """Score every short substring of a document as a candidate new word.

    The document is first stripped of whitespace, digits, ASCII letters and
    the punctuation listed in :meth:`get_words`. Every remaining substring of
    1..``max_word_len`` characters becomes a ``WordInfo`` carrying frequency,
    PMI and left/right neighbour entropy.

    Attributes:
        word_info: list of all ``WordInfo`` candidates, shortest first.
        avg_frq, avg_entropy, avg_pmi: means over ``word_info`` (``0.0`` when
            there are no candidates). The entropy of a candidate is the
            smaller of its left and right entropy.
        word_tf_pmi_ent: list of ``(text, length, freq, pmi, entropy)``
            tuples for the multi-character candidates whose frequency, PMI
            and entropy are all strictly above the configured minimums.
    """

    def __init__(self, doc, max_word_len = 5,  min_tf=0.000005, min_entropy=0.07, min_pmi=6.0 ):
        super(Seg, self).__init__()
        self.doc = doc
        self.max_word_len = max_word_len
        self.min_entropy = min_entropy
        self.min_tf = min_tf
        self.min_pmi = min_pmi
        self.word_info = self.get_words(doc)

        count = float(len(self.word_info))
        if count:
            self.avg_frq = sum(w.freq for w in self.word_info) / count
            self.avg_entropy = sum(
                min(w.left_entropy, w.right_entropy) for w in self.word_info) / count
            self.avg_pmi = sum(w.pmi for w in self.word_info) / count
        else:
            # An empty (or fully stripped) document has no candidates; report
            # zero averages instead of dividing by zero.
            self.avg_frq = 0.0
            self.avg_entropy = 0.0
            self.avg_pmi = 0.0

        # Built as a real list so it can be iterated more than once; on
        # Python 3 a bare map() would be exhausted after the first pass.
        self.word_tf_pmi_ent = [
            (w.text, len(w.text), w.freq, w.pmi, min(w.left_entropy, w.right_entropy))
            for w in self.word_info
            if self._passes_thresholds(w)
        ]

    def _passes_thresholds(self, info):
        """Return True if ``info`` is multi-character and beats all minimums."""
        return (len(info.text) > 1
                and info.pmi > self.min_pmi
                and info.freq > self.min_tf
                and min(info.left_entropy, info.right_entropy) > self.min_entropy)

    def extract_cadicateword(self,_doc, _max_word_len):
        """Return ``(start, end)`` slices for every candidate in ``_doc``.

        Every substring of length 1.._max_word_len is included, one entry per
        occurrence. The result is sorted by the substring text itself.
        """
        doc_length = len(_doc)
        indexes = [
            (i, j)
            for i in range(doc_length)
            for j in range(i + 1, min(i + 1 + _max_word_len, doc_length + 1))
        ]
        return sorted(indexes, key=lambda span: _doc[span[0]:span[1]])

    def get_words(self,doc):
        """Build and score a ``WordInfo`` for each distinct candidate in ``doc``.

        Returns the candidates as a list ordered by text length (shortest
        first). Progress counters and markers are printed along the way.
        """
        pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
        doc = pattern.sub(r'', doc)
        print(len(doc))
        word_index = self.extract_cadicateword(doc, self.max_word_len)
        word_cad = {}
        print(len(word_index))
        for start, end in word_index:
            word = doc[start:end]
            if word not in word_cad:
                word_cad[word] = WordInfo(word)
            info = word_cad[word]
            info.nub += 1
            # At either end of the document these slices are empty, so the
            # boundary is recorded as an empty-string neighbour.
            info.lupdate(doc[start - 1:start])
            info.rupdate(doc[end:end + 1])

        length = len(doc)
        # Frequencies must all be known before PMI is computed, because the
        # PMI of a candidate reads the frequencies of its sub-parts.
        for info in word_cad.values():
            info.compute_freq(length)
        print('1')

        # Shortest candidates first; this is also the order of the result.
        values = sorted(word_cad.values(), key=lambda x: len(x.text))
        print(len(values))
        print('2')
        for v in values:
            v.update(word_cad)

        print('3')
        # ``values`` is already a fresh list ordered by text length, so no
        # second sort is needed.
        return values

if __name__ == '__main__':
    # Script entry point: mine candidate words from the corpus file and write
    # the surviving candidates to a CSV file.
    path = 'xiyouji.txt'
    # Read through a context manager so the file handle is always closed.
    with codecs.open(path, "r", "utf-8") as corpus:
        doc = corpus.read()

    word = Seg(doc, max_word_len=3, min_tf=(1e-08), min_entropy=1.0, min_pmi=3.0)
    print('avg_frq:' + str(word.avg_frq))
    print('avg_pmi:' + str(word.avg_pmi))
    print('avg_entropy:' + str(word.avg_entropy))

    # Each row is [word, length, freq, pmi, entropy].
    wordlist = [list(row) for row in word.word_tf_pmi_ent]

    # Highest entropy first, ties broken by highest PMI. One sort on a tuple
    # key gives the same order as a stable sort by PMI followed by a stable
    # sort by entropy.
    wordlist.sort(key=lambda row: (row[4], row[3]), reverse=True)

    seg = pd.DataFrame(wordlist, columns=['word', 'length', 'fre', 'pmi', 'entropy'])
    seg.to_csv( 'extractword2.csv', index=False, encoding="utf-8")