# Load the dictionary of valid words, one word per line.
# rstrip() removes trailing whitespace (including the newline) from each line.
# The context manager closes the file as soon as the set has been built.
with open('vocab.txt') as vocab_file:
    vocab = {line.rstrip() for line in vocab_file}
# Generate the full set of candidate corrections.
def generate_candidates(word, vocabulary=None):
    """Return the dictionary words within edit distance 1 of ``word``.

    Candidates are produced by exactly one insert, delete or replace
    operation. For the input ``appl``:

    * replace: ``bppl``, ``cppl``, ...
    * insert:  ``bappl``, ``cappl``, ...
    * delete:  ``ppl``, ``apl``, ``app``, ...

    Only the 26 lowercase ASCII letters are used for inserts and replaces:
    we cannot predict that the user meant to type some other character such
    as punctuation, but a punctuation mark in the input may well be a
    mistyped letter, so it can still be replaced or deleted.

    Replacing a character with itself reproduces ``word``, so the input is
    part of the result whenever it is a valid word.

    Args:
        word: The (possibly misspelled) input word.
        vocabulary: Optional container of valid words used for filtering.
            Defaults to the module-level ``vocab``.

    Returns:
        A sorted list of the generated strings that occur in the vocabulary.
    """
    if vocabulary is None:
        vocabulary = vocab

    letters = "abcdefghijklmnopqrstuvwxyz"
    # Every way of cutting the word into a (left, right) pair, including
    # the two cuts with an empty side.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

    # insert: put one letter at the cut position.
    inserts = [left + c + right for left, right in splits for c in letters]
    # delete: drop the first character of the right part.
    deletes = [left + right[1:] for left, right in splits if right]
    # replace: swap the first character of the right part for a letter.
    replaces = [left + c + right[1:]
                for left, right in splits if right
                for c in letters]

    candidates = set(inserts + deletes + replaces)
    # Keep only real words; sorting makes the result order deterministic.
    return sorted(candi for candi in candidates if candi in vocabulary)
import nltk
from nltk.corpus import reuters
nltk.download('reuters')  # fetch the corpus if it is not available locally

# Read every sentence of every Reuters category.
categories = reuters.categories()
corpus = reuters.sents(categories=categories)

# term_count[w]         -> number of times w occurs as the first word of a bigram
# bigram_count['w1 w2'] -> number of times w1 is immediately followed by w2
term_count = {}
bigram_count = {}
for doc in corpus:
    # Prepend a sentence-start marker so the first word also has a left context.
    doc = ['<s>'] + doc
    for i in range(len(doc) - 1):
        # bigram: positions [i, i+1]
        term = doc[i]
        bigram = ' '.join(doc[i:i + 2])
        # Increment existing counters, or start them at 1 on first sight.
        term_count[term] = term_count.get(term, 0) + 1
        bigram_count[bigram] = bigram_count.get(bigram, 0) + 1
# sklearn has a ready-made package for this.
# Statistics of how users mistype words - the channel probability.
# channel_prob[correct][mistake] -> P(mistake | correct)
# Each line of spell-errors.txt has the form "correct: mistake1, mistake2, ...".
channel_prob = {}
with open('spell-errors.txt') as errors_file:
    for line in errors_file:
        items = line.split(':')
        if len(items) < 2:
            # Blank or malformed line without a "correct: mistakes" pair.
            continue
        correct = items[0].strip()
        mistakes = [item.strip() for item in items[1].strip().split(',')]
        channel_prob[correct] = {}
        for mis in mistakes:
            # Conditional probability that the user, intending `correct`,
            # types `mis` instead. Every listed misspelling gets the same
            # probability, which is a simplification (the original author
            # also questioned it).
            channel_prob[correct][mis] = 1.0 / len(mistakes)
import numpy as np
# Number of distinct terms seen in the corpus; used for add-one smoothing.
V = len(term_count)

# Each line of testdata.txt is tab-separated with the sentence in the third
# field. For every out-of-vocabulary word, print the word together with the
# best-scoring candidate correction.
with open('testdata.txt', 'r') as file:
    for line in file:
        items = line.rstrip().split('\t')
        if len(items) < 3:
            continue  # no sentence field on this line

        # Drop the sentence-final punctuation mark, then tokenize on
        # whitespace, e.g. ['I', 'like', 'playing'].
        line = []
        for token in items[2][:-1].split():
            # Remove punctuation attached to the token.
            if token[-1] in (",", "'"):
                token = token[:-1]
            elif token[-2:] in ("'s", "'t"):
                token = token[:-2]
            line.append(token)

        for position, word in enumerate(line):
            # A token can become empty once its punctuation is stripped.
            if not word or word in vocab:
                continue

            # The word has to be replaced by a correct one.
            # Step 1: generate all valid candidates.
            candidates = generate_candidates(word)
            # TODO: when there are no candidates, generate more of them,
            # e.g. everything within edit distance 2.
            if len(candidates) < 1:
                continue  # not recommended: the misspelling is silently left as is

            # Left context for the bigram model; '<s>' marks the sentence
            # start, matching how the corpus counts were built. The token
            # position is used directly instead of searching the sentence
            # string, which would yield a character offset.
            previous = line[position - 1] if position > 0 else '<s>'

            # Score every candidate and keep the one with the highest score:
            #   score = p(correct) * p(mistake | correct)
            #         = log p(correct) + log p(mistake | correct)
            probs = []
            for candi in candidates:
                # a. Channel probability p(mistake | correct).
                if candi in channel_prob and word in channel_prob[candi]:
                    prob = np.log(channel_prob[candi][word])
                else:
                    # Unknown misspelling: use a very small probability.
                    # Not a proper smoothing scheme, just a shortcut.
                    prob = np.log(0.0001)

                # b. Language-model probability p(candi | previous) with
                # add-one smoothing. bigram_count is keyed by 'w1 w2'.
                bigram = previous + ' ' + candi
                prob += np.log((bigram_count.get(bigram, 0) + 1.0) /
                               (term_count.get(previous, 0) + V))
                # TODO: also take the following bigram [word, post_word]
                # into account:
                # prob += np.log(<bigram probability>)

                probs.append(prob)

            max_idx = probs.index(max(probs))
            print(word, candidates[max_idx])
inverted_index = {} # data structure that can be used to build an inverted index
