Spelling Correction with Bayes' Rule and Maximum-Likelihood Estimation
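The script below follows the classic noisy-channel view of spelling correction: for an observed word s and a candidate correction c, Bayes' rule gives P(c|s) ∝ P(s|c) · P(c). The prior P(c) is estimated by maximum likelihood from word frequencies in a corpus, the error model is taken from locally stored frequency dictionaries, and each edit-distance candidate is scored by the product of the two probabilities, with the highest-scoring candidate returned. The code builds the corpus language model, loads the local dictionaries, generates candidates by deletion, substitution, and insertion, and finally scores them.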
import numpy as np
import jieba
import pandas as pd
import re
from itertools import permutations
import time
from sys import exc_info,stdout
import traceback
from os import getcwd
import os
# Decorator: print a short traceback (two frames) when the wrapped function raises
def shorterror(func):
    def In(*vars):
        try:
            return func(*vars)
        except Exception:
            exc_type, exc_value, exc_traceback_obj = exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback_obj, limit=2, file=stdout)
            print("exc_type: %s" % exc_type)
            print("exc_value: %s" % exc_value)
            print("exc_traceback_obj: %s" % exc_traceback_obj)
    return In
# Decorator: on failure, enable cgitb's detailed text traceback and retry the call
def longerror(func):
    def In(*vars):
        try:
            return func(*vars)
        except Exception:
            import cgitb
            cgitb.enable(format='text')
            return func(*vars)
    return In
# Decorator: time a call with time.process_time() and print the elapsed CPU seconds
def calltime(func):
    def In(*varc):
        start = time.process_time()
        result = func(*varc)
        print('The function', func.__name__, 'took', time.process_time() - start, 'seconds to run')
        return result
    return In
# Symbols and stop words to be stripped from the corpus
symbles=''':,"{[}](>)</\n。● ,、的 啊 好 和
并 与 及 对 错 你 我 我们 她 他 它:: ; ;《 》
1 2 3 4 5 6 7 8 9 0 ‘ “ ” ’ + - * / ` ~
\( \ [ \ { \ } ] ) ( )【 \xa0 】理想 愿景
工 不管 只要 一员 大家庭 当成 作 帅哥 美女 年轻
佛系
'''
# Remove stop words / symbols from a string
def del_stop_word(strings, symbles=symbles):
    srcrep = {i: '' for i in symbles}
    rep = dict((re.escape(k), v) for k, v in srcrep.items())
    pattern = re.compile("|".join(rep.keys()))
    return pattern.sub(lambda m: rep[re.escape(m.group(0))], strings)
# Read a text file
def read_txt(path):
    return open(path, 'r').read()
# Keep only Chinese characters
def just_chinese(string, resymbol=""):
    return re.sub(u"([^\u4e00-\u9fa5])", resymbol, string)
# Tokenize the corpus with jieba
def split_world(corpus):
    return np.array(list(jieba.cut(just_chinese(read_txt(corpus)))))
# Build a word -> probability (or negative log-probability) dictionary from the corpus
def word_dict_func(corpus, log=False):
    word_list = split_world(corpus)
    m = np.count_nonzero(word_list)
    kind, count = np.unique(word_list, return_counts=True)
    if log:
        prob = -np.log(count/m)
    else:
        prob = count/m
    return dict(zip(kind, prob))
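# Usage sketch (the file name is hypothetical, not from the original post):
#   unigram = word_dict_func("demo_corpus.txt", log=False)
#   # -> {'电影': 0.0021, '喜剧': 0.0007, ...}   each value is count(word) / total words (MLE)
# With log=True the same estimates are returned as negative log-probabilities,
# which turns products of probabilities into sums and avoids numerical underflow.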
# Load the local dictionary
# Keep only digits
def just_number(string, resymbol=""):
    sub_str = re.sub(u"([^\u0030-\u0039])", resymbol, string)
    return sub_str
# Reserved sigmoid helper for mapping scores into probability space (unused below)
def sigmod(z):
    return 1/(1+np.exp(-z))
# Filter word frequencies and normalize them into a probability dictionary
def filter_dict(words, numbers):
    word_dict, expr_sum = dict(), 0
    for word, num in zip(words, numbers):
        try:
            number = float(just_number(num))
            expr = {word: number}
        except Exception:
            number, expr = 0, {}
        finally:
            expr_sum += number
            word_dict.update(expr)
    prob = {word: word_dict[word]/expr_sum for word in word_dict}
    return prob
# Entry point for loading the local dictionary files
def location_dict(dir_path):
    init_dict = np.zeros((2, 2))
    for path in os.listdir(dir_path):
        try:
            file_path = "{}{}".format(dir_path, path)
            file_of_one = np.loadtxt(file_path, delimiter='\t', dtype=str)
            print("ok:", path)
        except Exception as error:
            file_of_one = np.array([line.replace("\n", "").split("\t") for line in open(file_path, 'r').readlines()])
            print("error:", path, error)
        finally:
            init_dict = np.r_[init_dict, file_of_one]
    words, numbers = init_dict[2:, 0], init_dict[2:, 1]
    return filter_dict(words, numbers)
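# Assumed input format (inferred from the loading code above, not stated in the post):
# each file under dir_path is a tab-separated list of "word<TAB>frequency" lines, e.g.
#   电影	38203
#   喜剧	5176
# filter_dict() then normalizes these frequencies into probabilities that sum to 1.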
# Candidate-generation module
# Build the character set used to generate candidate corrections
def create_char_map(str_range='lowercase', chinese_path=False):
    iter_range = lambda char_range: map(lambda x: chr(x), char_range)
    func_dict = {'lowercase': iter_range(range(97, 123))
                 , 'uppercase': iter_range(range(65, 91))
                 , 'numbers': iter_range(range(48, 58))
                 , 'chinese': tuple(set(just_chinese(read_txt(corpus))))}
    return func_dict[str_range]
# Join each character combination into a string
def collate_char_iterator(itertools_perm):
    return map(lambda x: "".join(x), itertools_perm)
# Character-combination generator
def chargen(language="lowercase", n=1):
    return collate_char_iterator(permutations(create_char_map(language), n))
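# Illustration: chargen("lowercase", 2) yields "ab", "ac", ..., "ba", "bc", ...
# Because permutations() is used instead of product(), strings with a repeated
# character such as "aa" are never generated.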
# Edit operation: insert generated characters before or after the word
def add_char(input_char, language="lowercase", n=2, forward=True):
    return ("{}{}".format(char, input_char)
            if forward else "{}{}".format(input_char, char)
            for char in chargen(language=language, n=n))
# Edit operation: replace substrings of the word with generated characters
def replace_char(input_char, language="lowercase", n=2):
    m = len(input_char)
    S = chargen(language=language, n=n)
    for create_str in S:
        for i in range(m):
            yield input_char.replace(input_char[i:i+n], create_str)
# Batch string-deletion helper
def delete_element(strings, symbles=symbles):
    srcrep = {i: '' for i in symbles}
    rep = dict((re.escape(k), v) for k, v in srcrep.items())
    pattern = re.compile("|".join(rep.keys()))
    return pattern.sub(lambda m: rep[re.escape(m.group(0))], strings)
# Edit operation: delete characters from the word
def delete_char(input_char, language="lowercase", n=2):
    return (delete_element(input_char, "".join(chars)) for chars in permutations(input_char, n))
# Generate all candidates produced by one round of edit operations (delete / replace / insert)
def translation_str(input_char, language="lowercase", n=2):
    del_ = delete_char(input_char, language=language, n=n)
    replace_ = replace_char(input_char, language=language, n=n)
    add_forward = add_char(input_char, language=language, n=n, forward=True)
    add_backward = add_char(input_char, language=language, n=n, forward=False)
    return tuple(list(del_) + list(replace_) + list(add_forward) + list(add_backward))
# Generate candidates for all edit sizes up to n
def translation_n(input_char, language="lowercase", n=2):
    result = []
    for i in range(1, n+1):
        result += list(translation_str(input_char, language=language, n=i))
    return np.array(result)
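# Usage sketch (illustrative word; the 'chinese' character set requires the
# global `corpus` path defined further below):
#   candidates = translation_n("电影", language="chinese", n=1)
#   # mixes deletions ("影", "电"), single-character substitutions, and corpus
#   # characters prepended or appended to the word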
# Spelling-correction module
@calltime
def check_str(input_char, word_dict=False, error_dict=False):
    prob_dict = dict()
    if not word_dict:
        word_dict = word_dict_func(corpus, log=False)
    if input_char in word_dict:
        check = filter(lambda word: len(word) > 0, translation_n(input_char, language="chinese", n=1))
        Pc = word_dict[input_char]
        for sc_element in check:
            if sc_element in error_dict:
                Psc = error_dict[sc_element]
                bayes_ = Psc * Pc
                expr = {bayes_: sc_element}
                prob_dict.update(expr)
        Eword = prob_dict[max(prob_dict)]
        return {"EM": Eword, "D": prob_dict, "C": Pc, "bayes": bayes_}
    return input_char
# Static configuration
corpus = "/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP数据集合/豆瓣电影数据集(2019.3)/豆瓣电影简介.txt"
dir_path = "/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP数据集合/词库/chinese/"
example_error = location_dict(dir_path)
example_error
word_dict = word_dict_func(corpus,log=False)
# Test run
test = check_str(input(),word_dict,example_error)
EMword, D, C, bayes = test["EM"], test['D'], test['C'], test['bayes']
EMword
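In the returned dictionary, "EM" is the candidate with the largest score, "D" maps each candidate's score (the product of the error-dictionary probability of the candidate and the corpus probability of the input word) to the candidate itself, "C" is the corpus probability of the input word, and "bayes" is the score of the last candidate examined.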
Original article: https://blog.csdn.net/weixin_43069769/article/details/107655775