Training Chinese Word Vectors (converting a dat file to txt, word segmentation, and training)
Preprocessing
Convert the dat file to a txt file
import codecs
import re

# path of the raw Sogou news dump (GB2312 encoded)
file_path = '/home/ricardo/out/news_sohusite_xml.dat'

# read the file, ignoring bytes that cannot be decoded
f2 = codecs.open(file_path, encoding='GB2312', errors='ignore')
content2 = f2.read()
f2.close()

# write the extracted text to a UTF-8 file
f = codecs.open('/home/ricardo/out/news_sohusite_xml.txt', 'w', encoding='utf8')

# extract the text between <content> and </content>
a = re.findall('<content>.*</content>', content2)
print("Length of list: %d" % len(a))

i = 0
for item in a:
    b = item.replace('<content>', '')
    b = b.replace('</content>', '')
    f.write(str(b) + '\n')
    i = i + 1
    if i % 1000 == 0:
        print("index: %d / %d" % (i, len(a)))
f.close()
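The original script also imports chardet, presumably to detect the file encoding, but then hard-codes GB2312. If the encoding is not known in advance, a minimal sketch of detecting it first with chardet (the sample size and the GB18030 fallback are arbitrary choices, not part of the original post):

import chardet

# let chardet guess the encoding from a chunk of raw bytes
with open('/home/ricardo/out/news_sohusite_xml.dat', 'rb') as raw:
    sample = raw.read(100000)       # 100 KB is usually enough for a stable guess
result = chardet.detect(sample)     # e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
print(result['encoding'], result['confidence'])

# decode with the detected encoding, falling back to GB18030 (a superset of GB2312)
encoding = result['encoding'] or 'GB18030'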
Word segmentation and stopword removal
import jieba
jieba.enable_parallel()  # parallel segmentation (only available on POSIX systems)

# build the stopword list (HIT stopword list)
def stopwordslist():
    stopwords = [line.strip() for line in open('/home/ricardo/stopwords/hit_stopwords.txt', encoding='UTF-8').readlines()]
    return stopwords

# segment one sentence with jieba and drop stopwords
def seg_depart(sentence, stopwords):
    sentence_depart = jieba.cut(sentence.strip())
    outstr = ''
    for word in sentence_depart:
        if word not in stopwords and word != '\t':
            outstr += word
            outstr += " "
    return outstr

# input and output paths
filename = "/home/ricardo/out/1.txt"
outfilename = "/home/ricardo/outout.txt"
inputs = open(filename, 'r', encoding='UTF-8')
outputs = open(outfilename, 'w', encoding='UTF-8')

# load the stopword list once instead of once per line
stopwords = stopwordslist()

# segment the corpus line by line and write the result
for line in inputs:
    line_seg = seg_depart(line, stopwords)
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()
Training
from gensim.models import word2vec
import multiprocessing

def train_wordVectors(sentences, embedding_size=128, window=5, min_count=5):
    # note: in gensim >= 4.0 the `size` parameter has been renamed to `vector_size`
    w2vModel = word2vec.Word2Vec(sentences, size=embedding_size, window=window,
                                 min_count=min_count, workers=multiprocessing.cpu_count())
    return w2vModel

def save_wordVectors(w2vModel, word2vec_path):
    w2vModel.save(word2vec_path)

def load_wordVectors(word2vec_path):
    w2vModel = word2vec.Word2Vec.load(word2vec_path)
    return w2vModel

if __name__ == '__main__':
    # a single file: read it with LineSentence
    sentences = word2vec.LineSentence('/home/ricardo/out.txt')
    # multiple files: read the whole directory with PathLineSentences
    # segment_dir = '/words/'
    # sentences = word2vec.PathLineSentences(segment_dir)

    # for ordinary training, setting the following few parameters is enough
    word2vec_path = '/home/ricardo/word2Vec.model'
    model2 = train_wordVectors(sentences, embedding_size=128, window=5, min_count=5)
    save_wordVectors(model2, word2vec_path)
    model2 = load_wordVectors(word2vec_path)
    print(model2.wv.similarity('你好', '您好'))
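Once the model is saved, it can be reloaded and queried. A small sketch of typical queries (the query word is an arbitrary example and must occur at least min_count times in the corpus, otherwise a KeyError is raised; the results depend entirely on the training data):

from gensim.models import word2vec

model = word2vec.Word2Vec.load('/home/ricardo/word2Vec.model')

# the vector for a single word (128-dimensional, matching embedding_size above)
vec = model.wv['中国']
print(vec.shape)

# the most similar words by cosine similarity
for word, score in model.wv.most_similar('中国', topn=5):
    print(word, score)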
Original article: https://blog.csdn.net/Ricardo98/article/details/107157024