week2
import re # regular expressions
import pandas as pd
from collections import defaultdict # for word-frequency counts
from time import time
import spacy # for text preprocessing
import logging # logging, used to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s",datefmt='%H:%M:%S',level=logging.INFO)
import multiprocessing
from gensim.models import Word2Vec
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
df = pd.read_csv('./simpsons_dataset.csv')
df.shape
(158314, 2)
df.head(10)
| | raw_character_text | spoken_words |
|---|---|---|
| 0 | Miss Hoover | No, actually, it was a little of both. Sometim... |
| 1 | Lisa Simpson | Where's Mr. Bergstrom? |
| 2 | Miss Hoover | I don't know. Although I'd sure like to talk t... |
| 3 | Lisa Simpson | That life is worth living. |
| 4 | Edna Krabappel-Flanders | The polls will be open from now until the end ... |
| 5 | Martin Prince | I don't think there's anything left to say. |
| 6 | Edna Krabappel-Flanders | Bart? |
| 7 | Bart Simpson | Victory party under the slide! |
| 8 | NaN | NaN |
| 9 | Lisa Simpson | Mr. Bergstrom! Mr. Bergstrom! |
Remove missing values
df.isnull().sum() # count how many missing values there are in total
raw_character_text 17814
spoken_words 26459
dtype: int64
df = df.dropna().reset_index(drop=True) # drop missing values and reset the index
df.isnull().sum()
raw_character_text 0
spoken_words 0
dtype: int64
Cleaning. Preprocessing English text typically includes stop-word removal, lowercasing, stemming, and lemmatization.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disable NER and the parser to speed up processing
def cleaning(doc):
    # doc is a spaCy Doc object
    # lemmatize and drop stop words
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # drop sentences with only one or two tokens; they contribute little to training
    if len(txt) > 2:
        return ' '.join(txt)
brif_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words']) # keep letters and apostrophes only, then lowercase
t = time()
txt = [cleaning(doc) for doc in nlp.pipe(brif_cleaning,batch_size=5000,n_threads=-1)]
print(f'Cleaning time: {round((time() - t) / 60, 2)} min')
Cleaning time: 1.12 min
df_clean = pd.DataFrame({'clean':txt}) # a new DataFrame to hold the cleaned text
df_clean = df_clean.dropna().drop_duplicates().reset_index(drop=True) # drop empty and duplicate rows
print(df_clean.shape)
df_clean.head()
(85964, 1)
| | clean |
|---|---|
| 0 | actually little disease magazine news show nat... |
| 1 | know sure like talk touch lesson plan teach |
| 2 | life worth live |
| 3 | poll open end recess case decide thought final... |
| 4 | victory party slide |
Bigrams
Use the gensim Phrases package to automatically detect common two-word phrases in the text that would otherwise be split into single words.
from gensim.models.phrases import Phrases, Phraser
sent = [row.split() for row in df_clean['clean']]
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
INFO - 18:26:39: collecting all words and their counts
INFO - 18:26:39: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 18:26:39: PROGRESS: at sentence #10000, processed 63561 words and 52816 word types
INFO - 18:26:39: PROGRESS: at sentence #20000, processed 130943 words and 99866 word types
INFO - 18:26:39: PROGRESS: at sentence #30000, processed 192972 words and 138532 word types
INFO - 18:26:39: PROGRESS: at sentence #40000, processed 249842 words and 172659 word types
INFO - 18:26:40: PROGRESS: at sentence #50000, processed 311265 words and 208566 word types
INFO - 18:26:40: PROGRESS: at sentence #60000, processed 373588 words and 243702 word types
INFO - 18:26:40: PROGRESS: at sentence #70000, processed 436441 words and 278740 word types
INFO - 18:26:40: PROGRESS: at sentence #80000, processed 497829 words and 311886 word types
INFO - 18:26:40: collected 330804 word types from a corpus of 537160 words (unigram + bigrams) and 85964 sentences
INFO - 18:26:40: using 330804 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 18:26:40: source_vocab length 330804
INFO - 18:26:42: Phraser built with 126 phrasegrams
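Before transforming the whole corpus, the Phraser can be sanity-checked on a single token list (a hedged sketch; whether a given pair is merged depends on which phrasegrams were actually detected in this corpus):
# apply the trained Phraser to one tokenized sentence;
# detected pairs such as 'homer' + 'simpson' are joined with an underscore
print(bigram[['homer', 'simpson', 'play', 'saxophone']])
# e.g. ['homer_simpson', 'play', 'saxophone'] if that pair met the min_count/threshold criteria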
print(len(sent),len(sent[0]))
85964 8
sentences = bigram[sent]
for i in sentences:
    print(i)
    break
['actually', 'little', 'disease', 'magazine', 'news', 'show', 'natural', 'think']
Count word frequencies
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)
30178
sorted(word_freq, key=word_freq.get, reverse=True)[:10] # reverse=True sorts in descending order
['oh', 'like', 'know', 'get', 'hey', 'think', 'right', 'look', 'want', 'come']
Train the model
Using the gensim Word2Vec implementation.
cores = multiprocessing.cpu_count() # number of CPU cores on this machine
w2v_model = Word2Vec(min_count=20,      # ignore words that appear fewer than 20 times
                     window=3,          # context window size
                     size=300,          # embedding dimensionality
                     sample=6e-5,       # downsampling threshold for very frequent words
                     min_alpha=0.0007,  # minimum learning rate
                     negative=20,       # number of negative samples
                     workers=cores-1)   # number of worker threads
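The parameter names above follow gensim 3.x, which is what the logs below come from. As a hedged sketch only, assuming gensim 4.x is installed, the equivalent constructor renames size to vector_size while the other arguments keep their names:
# gensim 4.x variant (assumption: gensim >= 4.0); only `size` changes name
w2v_model = Word2Vec(min_count=20,
                     window=3,
                     vector_size=300,
                     sample=6e-5,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores-1)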
Word2Vec requires building a vocabulary first: the set of unique words found in the corpus.
t = time()
w2v_model.build_vocab(sentences,progress_per=10000)
print(f'Vocabulary build time: {round((time()-t)/60,2)} min')
INFO - 19:19:00: collecting all words and their counts
INFO - 19:19:00: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 19:19:01: PROGRESS: at sentence #10000, processed 61718 words, keeping 9558 word types
INFO - 19:19:01: PROGRESS: at sentence #20000, processed 127351 words, keeping 14506 word types
INFO - 19:19:01: PROGRESS: at sentence #30000, processed 187829 words, keeping 17619 word types
INFO - 19:19:01: PROGRESS: at sentence #40000, processed 243332 words, keeping 20385 word types
INFO - 19:19:01: PROGRESS: at sentence #50000, processed 303182 words, keeping 22878 word types
INFO - 19:19:01: PROGRESS: at sentence #60000, processed 363940 words, keeping 25200 word types
INFO - 19:19:02: PROGRESS: at sentence #70000, processed 425408 words, keeping 27401 word types
INFO - 19:19:02: PROGRESS: at sentence #80000, processed 485464 words, keeping 29275 word types
INFO - 19:19:02: collected 30178 word types from a corpus of 523700 raw words and 85964 sentences
INFO - 19:19:02: Loading a fresh vocabulary
INFO - 19:19:02: effective_min_count=20 retains 3319 unique words (10% of original 30178, drops 26859)
INFO - 19:19:02: effective_min_count=20 leaves 437324 word corpus (83% of original 523700, drops 86376)
INFO - 19:19:02: deleting the raw counts dictionary of 30178 items
INFO - 19:19:02: sample=6e-05 downsamples 1200 most-common words
INFO - 19:19:02: downsampling leaves estimated 199161 word corpus (45.5% of prior 437324)
INFO - 19:19:02: estimated required memory for 3319 words and 300 dimensions: 9625100 bytes
INFO - 19:19:02: resetting layer weights
Vocabulary build time: 0.03 min
print(type(w2v_model.wv.vocab))
w2v_model.wv.vocab
Train the model
t = time()
w2v_model.train(sentences,total_examples=w2v_model.corpus_count,epochs=30,report_delay=1)
print(f'Training time: {round((time()-t)/60,2)} min')
Extract the embeddings
emb = w2v_model[w2v_model.wv.vocab] # embedding matrix for the whole vocabulary (gensim 3.x indexing style)
emb.shape
w2v_model['bart']
array([ 4.18446735e-02, 1.65442228e-02, 1.10797070e-01, -7.40554929e-02,
1.05384504e-02, -8.67301524e-02, -7.87724778e-02, -5.20401150e-02,
1.33793727e-01, 3.46963629e-02, -2.17480376e-01, -7.15462416e-02,
-1.61051452e-01, -1.99810088e-01, 9.12169646e-03, -9.42036584e-02,
-6.04747888e-03, -1.38541222e-01, 1.89868629e-01, -2.10961729e-01,
8.02108720e-02, 2.26279184e-01, 3.21998298e-02, -1.60336062e-01,
2.57940203e-01, 1.48253620e-01, -5.82630187e-02, 1.63063720e-01,
-6.18373118e-02, 1.13122966e-02, -7.05385394e-03, 2.98151486e-02,
-8.10188893e-03, 1.07171334e-01, -2.07827196e-01, -3.32180709e-02,
-9.11904871e-02, 1.33428872e-01, -6.60030916e-02, -7.22064450e-02,
-9.73893777e-02, -1.38411224e-01, 1.06587082e-01, 3.06930810e-01,
1.21861070e-01, 6.25069886e-02, 1.11081287e-01, 3.12810302e-01,
-1.36235222e-01, -1.74878046e-01, 3.45079690e-01, -8.53907391e-02,
-1.50433471e-02, 2.10457683e-01, 2.05016449e-01, -1.22747660e-01,
-7.99397677e-02, 5.56420013e-02, -2.19099805e-01, 1.37577718e-02,
-1.98015675e-01, 3.68048176e-02, 2.55097359e-01, 1.42521381e-01,
-1.74239323e-01, -1.60447344e-01, -1.14176542e-01, 4.34005074e-02,
-7.26121441e-02, -1.10671021e-01, 4.32553664e-02, 9.04392973e-02,
1.10290693e-02, -5.20448163e-02, 9.85610560e-02, 2.75455356e-01,
1.75303444e-01, 1.96430776e-02, 4.74824645e-02, -4.69678128e-03,
1.24523170e-01, -6.56014532e-02, 4.20933915e-03, 1.69960871e-01,
-5.21478467e-02, 2.65770376e-01, -8.78552571e-02, 3.88482511e-02,
-1.07387975e-01, -5.63755259e-02, 1.99031970e-03, 2.76443493e-02,
7.06845708e-03, 2.84465104e-01, 2.21011624e-01, -3.85178253e-02,
-1.01346865e-01, 9.94836092e-02, 4.41770032e-02, -9.77715254e-02,
7.33498037e-02, -8.46569166e-02, -7.47960806e-02, -7.11960427e-04,
3.07233363e-01, -1.10685572e-01, -1.24751190e-02, -1.11756846e-02,
2.13319406e-01, 2.04286054e-02, 1.20374531e-01, -2.99752001e-02,
-3.65924381e-04, 7.82198086e-02, 1.81546062e-01, 4.79292460e-02,
2.16977030e-01, -1.61388963e-01, -4.21743877e-02, -1.01881206e-01,
-3.01146686e-01, 7.50813857e-02, -1.08071931e-01, 1.55717328e-01,
1.53191417e-01, 3.69453616e-02, -1.08489737e-01, 4.88367081e-02,
-1.10981829e-01, -6.93725795e-02, -8.09453987e-03, 2.67264135e-02,
-1.95200756e-01, 1.01056635e-01, -8.75088498e-02, -1.28447443e-01,
3.45552564e-02, -9.89269093e-02, -4.25559208e-02, 2.57533062e-02,
1.89986974e-01, -1.44026443e-01, -1.93969890e-01, 2.09505092e-02,
2.82953501e-01, -9.23318416e-02, 1.31291091e-01, 4.14298102e-02,
-2.13174403e-01, 2.78377771e-01, 1.17607720e-01, -2.78163459e-02,
1.38917018e-03, -1.52276844e-01, 9.76652652e-02, 1.26440182e-01,
1.75357282e-01, 1.90284744e-01, -6.16173074e-02, -1.24567248e-01,
-1.66846756e-02, -4.47729137e-03, 3.53813171e-02, 3.45557369e-02,
2.48954177e-01, 7.11286217e-02, -1.55285239e-01, 2.42902517e-01,
-4.18721773e-02, 4.39957678e-02, -1.11102581e-03, 2.41308566e-03,
1.83383003e-01, 1.76997818e-02, 5.44071831e-02, -2.58670360e-01,
-1.49985790e-01, -2.73436643e-02, 2.01459810e-01, 6.82130978e-02,
1.23596974e-01, -2.34090284e-01, -1.53957993e-01, -9.87507217e-03,
1.37849823e-01, -2.85730124e-01, -4.11835425e-02, 1.26899734e-01,
2.18792573e-01, 3.52200530e-02, 4.75526340e-02, -2.48019665e-01,
-1.89031705e-01, -2.19905317e-01, -3.67614664e-02, 8.95649642e-02,
1.88915104e-01, 8.42700228e-02, 2.24755466e-01, -9.62386131e-02,
1.06391780e-01, -2.73552071e-02, 2.54500121e-01, 6.35677800e-02,
-5.63272350e-02, 1.93823352e-01, -4.65679504e-02, -7.59726316e-02,
7.39268363e-02, -9.30521116e-02, -6.66098073e-02, 4.74396767e-03,
9.39877108e-02, 2.36539021e-01, -7.99848363e-02, 9.79661122e-02,
3.83246318e-02, -1.96012408e-01, 2.95899156e-03, -2.84999460e-02,
9.95977446e-02, -3.20668489e-01, 2.39799485e-01, 6.70266449e-02,
-2.58743227e-01, 1.39223203e-01, 8.49272124e-04, -1.75515890e-01,
2.48446628e-01, -2.20555544e-01, 2.86590531e-02, 1.39220208e-01,
5.15840352e-02, -1.65669516e-01, -8.11747313e-02, 9.64312404e-02,
-7.05874413e-02, -6.21024594e-02, -2.29118168e-01, 9.39650983e-02,
1.45855337e-01, -1.87468886e-01, 1.90643996e-01, -3.45012844e-02,
2.84452856e-01, 5.11127859e-02, -5.48401065e-02, -5.97156473e-02,
-2.88056552e-01, -3.07915539e-01, 1.89015299e-01, 1.80904984e-01,
-4.66281697e-02, 1.23891816e-01, 6.82776198e-02, 5.76543063e-02,
2.18033895e-01, -3.46376091e-01, 3.39582860e-01, 2.83051759e-01,
-3.10344938e-02, -8.33643749e-02, 6.72021061e-02, -7.77925923e-02,
1.98335961e-01, 1.16051346e-01, -3.86202969e-02, -2.11613551e-02,
1.58307210e-01, -1.35569334e-01, 1.49830515e-02, 7.86971226e-02,
-2.96135008e-01, -1.25737593e-01, -2.85492629e-01, -8.08115602e-02,
1.05703622e-01, 8.19293708e-02, 1.21872239e-01, 1.74724743e-01,
2.01677725e-01, -9.57962796e-02, 6.73509017e-02, 8.34021419e-02,
4.85367507e-01, 1.94445774e-01, 2.31088057e-01, 5.52801378e-02,
-4.68322672e-02, -1.26047000e-01, -9.96087119e-02, 8.41373671e-03,
-8.84510726e-02, -8.66838023e-02, 4.89974730e-02, 6.69751987e-02,
-4.50237483e-01, -1.70215145e-01, -1.03328973e-01, 7.47608989e-02],
dtype=float32)
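Indexing the model object directly, as in w2v_model['bart'] above, is the old gensim 3.x shortcut. A hedged equivalent that also works on newer gensim versions goes through the KeyedVectors object:
# look vectors up on the KeyedVectors object rather than on the model itself
vec = w2v_model.wv['bart']      # 300-dimensional vector for 'bart'
emb = w2v_model.wv.vectors      # full embedding matrix, shape (vocab_size, 300)
print(vec.shape, emb.shape)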
Model exploration
Most similar to:
Given a character, check which words are most similar to it.
w2v_model.wv.most_similar(positive=['homer'])
INFO - 19:38:05: precomputing L2-norms of word weight vectors
[('sweetheart', 0.8782800436019897),
('marge', 0.8706507682800293),
('crummy', 0.8670642971992493),
('gee', 0.8647564649581909),
('ask', 0.8638630509376526),
('happen', 0.8629839420318604),
('nervous', 0.8582974076271057),
('sorry', 0.8531917929649353),
('rude', 0.8500913381576538),
('screw', 0.8495371341705322)]
w2v_model.wv.most_similar(positive=['homer_simpson'])
[('congratulation', 0.9103197455406189),
('select', 0.8930548429489136),
('recent', 0.8911806344985962),
('elect', 0.8883221745491028),
('united_states', 0.8882380127906799),
('fellow', 0.8821471929550171),
('governor', 0.8779288530349731),
('council', 0.8718430995941162),
('clinton', 0.8676393032073975),
('gem', 0.8636420369148254)]
w2v_model.wv.most_similar(positive=['bart'])
[('lisa', 0.9212876558303833),
('mom_dad', 0.9003821015357971),
('mom', 0.8844820857048035),
('upset', 0.880962610244751),
('milhouse', 0.880669891834259),
('selfish', 0.8728194236755371),
('run_away', 0.8721733093261719),
('surprised', 0.8695642948150635),
('strangle', 0.8638743758201599),
('juliet', 0.8549156785011292)]
w2v_model.wv.most_similar(negative=['bart'])
[('con', -0.0002270955592393875),
("'_n", -0.00845993310213089),
('lo', -0.020754491910338402),
('north', -0.027228128165006638),
('la', -0.027274325489997864),
('hel', -0.028816021978855133),
('de', -0.030296267941594124),
('gold', -0.030602240934967995),
('cream', -0.043637119233608246),
('sauce', -0.04565941169857979)]
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])
WARNING - 19:55:11: vectors for words {'kearney'} are not present in the model, ignoring these words
C:\Users\Lemon\anaconda3\lib\site-packages\gensim\models\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
'jimbo'
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)
[('tell', 0.8039718866348267),
('admire', 0.8012653589248657),
('imagine', 0.8001151084899902)]
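The positive/negative query above is, roughly, vector arithmetic: it looks for words close to vec('woman') + vec('homer') - vec('marge'). A hedged sketch of the same idea done by hand (most_similar averages unit-normalized vectors, so the ranking can differ slightly):
# build the analogy vector explicitly and search near it
query = w2v_model.wv['woman'] + w2v_model.wv['homer'] - w2v_model.wv['marge']
print(w2v_model.wv.similar_by_vector(query, topn=3))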
def tsnescatterplot(model, word, list_names):
    """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word,
    its list of most similar words, and a list of words.
    """
    arrays = np.empty((0, 300), dtype='f')
    word_labels = [word]
    color_list = ['red']
    # get the embedding of the query word
    arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)
    # get the most similar words
    close_words = model.wv.most_similar([word])
    # append the embedding of each similar word
    for wrd_score in close_words:
        wrd_vector = model.wv.__getitem__([wrd_score[0]])
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)
    # add the vector for each of the words from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv.__getitem__([wrd])
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)
    # use PCA to reduce from 300 to 18 dimensions
    reduc = PCA(n_components=18).fit_transform(arrays)
    # find t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    # use t-SNE on top of the PCA output to reduce further to 2 dimensions; the PCA step saves time
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)
    # put the reduced coordinates into a DataFrame
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})
    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']
                                  }
                     )
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line],
                df['y'][line],
                ' ' + df["words"][line].title(),
                horizontalalignment='left',
                verticalalignment='bottom', size='medium',
                color=df['color'][line],
                weight='normal'
                ).set_size(15)
    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)
    plt.title('t-SNE visualization for {}'.format(word.title()))
tsnescatterplot(w2v_model, 'homer', ['dog', 'bird', 'ah', 'maude', 'bob', 'mel', 'apu', 'duff'])