Mining New Words with Big-Data Techniques: An Implementation
2022-04-12 20:19:45
This post walks through an implementation of new-word detection: mining previously unseen words from a large raw corpus.
Code reference: https://github.com/yanghanxy/New-Word-Detection
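The script below scores every candidate string with three statistics: how often it occurs (frequency), how strongly its internal parts stick together (pointwise mutual information, PMI), and how varied the characters on either side of it are (left/right branching entropy). A candidate that clears all three thresholds is kept as a likely word. Before the full program, here is a rough sketch of the two less obvious statistics; the helper names and the probabilities are made up for illustration and are not part of the original script:

import math
from collections import Counter

def pmi(p_word, p_left, p_right):
    # log( P(word) / (P(left) * P(right)) ): high when the two parts
    # co-occur far more often than chance would predict
    return math.log(p_word / (p_left * p_right))

def branching_entropy(neighbors):
    # Shannon entropy of the characters observed next to a candidate word
    total = len(neighbors)
    counts = Counter(neighbors)
    return sum(-c / total * math.log(c / total) for c in counts.values())

# illustrative probabilities only: the candidate occurs with P = 0.002,
# its two halves with P = 0.01 and P = 0.005
print(pmi(0.002, 0.01, 0.005))            # about 3.69 -> strong cohesion
print(branching_entropy(list('了了了了')))  # 0.0 -> always the same neighbour
print(branching_entropy(list('了是的不')))  # about 1.39 -> varied neighbours

In practice, true words tend to score high on both measures, while fragments that only ever appear inside one longer phrase collapse on one of the entropies.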
import codecs
import math
import re

import pandas as pd


class WordInfo(object):
    """Statistics for one candidate word: frequency, PMI and neighbour entropy."""

    def __init__(self, text):
        self.text = text
        self.left_entropy = 0.0
        self.right_entropy = 0.0
        self.freq = 0.0
        self.pmi = 0.0
        self.nub = 0            # raw occurrence count
        self.left_word = []     # characters seen immediately to the left
        self.right_word = []    # characters seen immediately to the right

    def lupdate(self, word):
        self.left_word.append(word)

    def rupdate(self, word):
        self.right_word.append(word)

    def update(self, word_cad):
        if len(self.text) > 1:
            self.computer_pmi(word_cad)
        self.compute_indexes()

    def compute_freq(self, length):
        self.freq = 1.0 * self.nub / length

    def computer_pmi(self, word_cad):
        # PMI over every two-part split of the candidate; keep the weakest split
        sub_part = [(self.text[0:i], self.text[i:]) for i in range(1, len(self.text))]
        if len(sub_part) > 0:
            self.pmi = min(map(
                lambda word: math.log(self.freq / word_cad[word[0]].freq / word_cad[word[1]].freq),
                sub_part))

    def compute_entropy(self, _list):
        # Shannon entropy of the neighbouring characters
        length = float(len(_list))
        frequence = {}
        if length == 0:
            return 0
        for i in _list:
            frequence[i] = frequence.get(i, 0) + 1
        return sum(map(lambda x: -x / length * math.log(x / length), frequence.values()))

    def compute_indexes(self):
        # left/right (branching) entropy of the candidate word
        self.left_entropy = self.compute_entropy(self.left_word)
        self.right_entropy = self.compute_entropy(self.right_word)


class Seg(object):
    def __init__(self, doc, max_word_len=5, min_tf=0.000005, min_entropy=0.07, min_pmi=6.0):
        super(Seg, self).__init__()
        self.doc = doc
        self.max_word_len = max_word_len
        self.min_entropy = min_entropy
        self.min_tf = min_tf
        self.min_pmi = min_pmi
        self.word_info = self.get_words(doc)
        count = float(len(self.word_info))
        self.avg_frq = sum(map(lambda w: w.freq, self.word_info)) / count
        self.avg_entropy = sum(map(lambda w: min(w.left_entropy, w.right_entropy), self.word_info)) / count
        self.avg_pmi = sum(map(lambda w: w.pmi, self.word_info)) / count
        # keep multi-character candidates whose PMI, frequency and weaker-side entropy
        # all clear the thresholds
        filter_function = lambda f: len(f.text) > 1 and f.pmi > self.min_pmi \
            and f.freq > self.min_tf \
            and min(f.left_entropy, f.right_entropy) > self.min_entropy
        self.word_tf_pmi_ent = map(
            lambda w: (w.text, len(w.text), w.freq, w.pmi, min(w.left_entropy, w.right_entropy)),
            filter(filter_function, self.word_info))

    def extract_cadicateword(self, _doc, _max_word_len):
        # enumerate every substring of length 1.._max_word_len as a candidate
        indexes = []
        doc_length = len(_doc)
        for i in range(doc_length):
            for j in range(i + 1, min(i + 1 + _max_word_len, doc_length + 1)):
                indexes.append((i, j))
        return sorted(indexes, key=lambda _word: _doc[_word[0]:_word[1]])

    def get_words(self, doc):
        # strip whitespace, digits, Latin letters and punctuation before counting
        pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
        doc = pattern.sub(r'', doc)
        print('document length after cleaning:', len(doc))
        word_index = self.extract_cadicateword(doc, self.max_word_len)
        print('candidate positions:', len(word_index))
        word_cad = {}
        for index in word_index:
            word = doc[index[0]:index[1]]
            if word not in word_cad:
                word_cad[word] = WordInfo(word)
            word_cad[word].nub += 1
            # record the neighbouring characters (empty string at document boundaries)
            word_cad[word].lupdate(doc[index[0] - 1:index[0]])
            word_cad[word].rupdate(doc[index[1]:index[1] + 1])
        length = len(doc)
        # frequency of each candidate word
        for word in word_cad:
            word_cad[word].compute_freq(length)
        # process candidates in order of word length
        values = sorted(word_cad.values(), key=lambda x: len(x.text))
        print('candidate words:', len(values))
        for v in values:
            v.update(word_cad)
        # return candidates ordered by word length
        return sorted(values, key=lambda v: len(v.text), reverse=False)


if __name__ == '__main__':
    path = 'xiyouji.txt'
    doc = codecs.open(path, 'r', 'utf-8').read()
    word = Seg(doc, max_word_len=3, min_tf=1e-08, min_entropy=1.0, min_pmi=3.0)
    print('avg_frq:' + str(word.avg_frq))
    print('avg_pmi:' + str(word.avg_pmi))
    print('avg_entropy:' + str(word.avg_entropy))
    wordlist = []
    for i in word.word_tf_pmi_ent:
        wordlist.append([i[0], i[1], i[2], i[3], i[4]])
    # sort by PMI, then (stable sort) by the weaker-side entropy
    wordlist = sorted(wordlist, key=lambda word: word[3], reverse=True)
    wordlist = sorted(wordlist, key=lambda word: word[4], reverse=True)
    seg = pd.DataFrame(wordlist, columns=['word', 'length', 'fre', 'pmi', 'entropy'])
    seg.to_csv('extractword2.csv', index=False, encoding='utf-8')
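To try it, place a UTF-8 plain-text corpus next to the script (the original uses xiyouji.txt, presumably a copy of Journey to the West) and run it; accepted candidates are written to extractword2.csv. Note that the thresholds passed in __main__ (min_tf, min_entropy, min_pmi) override the class defaults and should be tuned per corpus. A quick way to inspect the top of the output, assuming the file name and column names produced by the DataFrame above:

import pandas as pd

result = pd.read_csv('extractword2.csv', encoding='utf-8')
# the script orders rows by the weaker-side entropy (ties broken by PMI),
# so the head holds the most word-like candidates
print(result.head(20))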