欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

贝叶斯原理做最大似然估计实现拼写纠错

程序员文章站 2022-06-26 20:44:46
import numpy as npimport jieba import pandas as pdimport refrom itertools import permutationsimport timefrom sys import exc_info,stdoutimport tracebackfrom os import getcwdimport osdef shorterror(func): def In(*vars): try :...
import numpy as np
import jieba 
import pandas as pd
import re
from itertools import permutations
import time
from sys import exc_info,stdout
import traceback
from os import getcwd
import os


def shorterror(func):
    """Decorator: run *func*, printing a short (2-frame) traceback on failure.

    Returns the wrapped function's result, or ``None`` when an exception was
    caught and reported.  Fixes the original's trailing comma after
    ``return func(*vars)``, which wrapped every successful result in a 1-tuple.
    """
    def In(*vars):
        try:
            return func(*vars)
        except Exception:
            exc_type, exc_value, exc_traceback_obj = exc_info()
            # limit=2 keeps the report short; errors go to stdout, not stderr
            traceback.print_exception(exc_type, exc_value, exc_traceback_obj, limit=2, file=stdout)
            print("exc_type: %s" % exc_type)
            print("exc_value: %s" % exc_value)
            print("exc_traceback_obj: %s" % exc_traceback_obj)
    return In

def longerror(func):
    """Decorator: on failure, enable cgitb's verbose text tracebacks, then re-run.

    On success the wrapped result is returned directly (the original's
    trailing comma wrapped it in a 1-tuple — fixed).  On exception the
    function is invoked a second time so cgitb can render the full report;
    the exception then propagates to the caller.
    """
    def In(*vars):
        try:
            return func(*vars)
        except Exception:
            import cgitb
            cgitb.enable(format='text')
        # Deliberate second call: re-raises under cgitb's detailed handler.
        return func(*vars)
    return In

def calltime(func):
    """Decorator: time one invocation of *func* and print the CPU time used.

    Fixes the original implementation, which invoked *func* TWICE per call
    (once timed and discarded, once for the return value) — doubling every
    side effect and the total runtime.
    """
    def In(*varc):
        start = time.process_time()
        result = func(*varc)
        elapsed = time.process_time() - start
        print('The Function', func.__name__, 'Takes Time To Run :', elapsed, 'Seconds')
        return result
    return In



symbles=''':,"{[}](>)</\n。●  ,、的 啊 好 和
并 与 及 对 错 你 我 我们 她 他 它:: ; ;《 》
1 2 3 4 5 6 7 8 9 0  ‘ “ ” ’ + - * / ` ~ 
\( \ [ \ { \ } ] ) ( )【 \xa0 】理想 愿景
工 不管 只要 一员 大家庭 当成 作 帅哥 美女 年轻
佛系
'''
# Strip stop-characters / stop-words
def del_stop_word(strings, symbles=symbles):
    """Delete from *strings* every individual character that appears in *symbles*."""
    # One C-level pass with str.translate — equivalent to the per-character
    # regex alternation, since *symbles* is consumed character by character.
    return strings.translate(str.maketrans('', '', symbles))
    
# Read a whole text file
def read_txt(path):
    """Return the full contents of the text file at *path*.

    Uses a context manager so the file handle is always closed — the
    original leaked the handle by calling ``open(...).read()`` directly.
    Encoding is left at the platform default to match original behavior;
    NOTE(review): corpus is Chinese text — confirm whether an explicit
    encoding (e.g. utf-8) should be pinned.
    """
    with open(path, 'r') as f:
        return f.read()
 
# Keep Chinese characters only
def just_chinese(string, resymbol=""):
    """Replace every character outside the CJK range U+4E00–U+9FA5 with *resymbol*."""
    non_chinese = re.compile(u"([^\u4e00-\u9fa5])")
    return non_chinese.sub(resymbol, string)

# Word segmentation
def split_world(corpus):
    """Read *corpus*, keep only its Chinese characters, and jieba-segment them.

    Returns a numpy array of token strings.
    """
    chinese_text = just_chinese(read_txt(corpus))
    tokens = jieba.cut(chinese_text)
    return np.array(list(tokens))

# Build a word -> probability dictionary from the corpus
def word_dict_func(corpus, log=False):
    """Map each segmented word to its relative frequency in *corpus*.

    With ``log=True`` the values are negative log-probabilities instead of
    raw frequencies.
    """
    words = split_world(corpus)
    total = np.count_nonzero(words)          # non-empty tokens
    kinds, counts = np.unique(words, return_counts=True)
    freqs = counts / total
    if log:
        freqs = -np.log(freqs)
    return dict(zip(kinds, freqs))


# Local-dictionary loading helpers
# Keep ASCII digits only
def just_number(string, resymbol=""):
    """Replace every non-digit character (outside U+0030–U+0039) with *resymbol*.

    NOTE: this also strips '.', so "3.5" becomes "35".
    """
    digits_only = re.sub(u"([^\u0030-\u0039])", resymbol, string)
    return digits_only

# Reserved sigmoid helper for mapping scores into probability space
# (name kept as-is for callers; "sigmod" is the original spelling)
def sigmod(z):
    """Logistic sigmoid: 1 / (1 + e^(-z))."""
    exp_neg = np.exp(-z)
    return 1 / (1 + exp_neg)

# Normalize raw word counts into a probability dictionary
def filter_dict(words, numbers):
    """Build ``{word: count/total}`` from parallel *words* / *numbers* sequences.

    *numbers* entries are parsed with :func:`just_number` then ``float``;
    entries that yield no parseable number are skipped.

    Fixes the original, where a parse failure on the FIRST row raised
    NameError (``expr`` unbound in the ``finally`` block), and a later
    failure silently re-applied the PREVIOUS row's entry.  Also guards
    against division by zero when no row parses.
    """
    word_dict = {}
    expr_sum = 0.0
    for word, num in zip(words, numbers):
        try:
            number = float(just_number(num))
        except ValueError:
            # no digits at all in this cell — skip it
            continue
        expr_sum += number
        word_dict[word] = number
    if expr_sum == 0:
        return {}
    return {word: count / expr_sum for word, count in word_dict.items()}

# Entry point for loading the local word-frequency dictionaries
def location_dict(dir_path):
    """Load every word-frequency file under *dir_path* into one probability dict.

    Each file is expected to be tab-separated with the word in column 0 and a
    count in column 1.  ``np.loadtxt`` is tried first; on any failure the file
    is re-parsed line by line as a fallback.  The combined rows are passed to
    ``filter_dict`` for normalization.

    NOTE(review): *dir_path* must end with a path separator — the file path is
    built by plain string concatenation, not os.path.join.
    NOTE(review): if the very first ``"{}{}".format`` line ever raised,
    ``file_path`` would be unbound in the except branch; in practice format()
    does not raise here.
    """
    # 2x2 zero seed keeps np.r_ happy on the first concatenation; its two
    # dummy rows are sliced off below with [2:].
    init_dict = np.zeros((2,2))
    for path in os.listdir(dir_path):
        try:
            file_path = "{}{}".format(dir_path,path)
            file_of_one = np.loadtxt(file_path,delimiter='\t',dtype=str)
            print("ok:",path)
        except Exception as error:
            # fallback: manual tab-split per line when loadtxt chokes
            file_of_one = np.array([line.replace("\n","").split("\t") for line in open(file_path,'r').readlines()])
            print("error:",path,error)
        finally :
            init_dict = np.r_[init_dict,file_of_one]
    # drop the two seed rows; column 0 = words, column 1 = raw counts
    words,numbers = init_dict[2:,0],init_dict[2:,1]
    return filter_dict(words,numbers)



# Candidate-character generation module
# Build the alphabet used to generate replacement/insertion characters
def create_char_map(str_range='lowercase', chinese_path=False):
    """Return an iterable of candidate characters for *str_range*.

    ``'lowercase'`` -> a–z, ``'uppercase'`` -> A–Z, ``'numbers'`` -> 0–9,
    ``'chinese'`` -> the set of distinct Chinese characters in the corpus
    (from *chinese_path* when given, else the module-level ``corpus``).

    Fixes vs. the original: the ranges were off by one (dropping 'z', 'Z'
    and '9'), the corpus file was read eagerly on EVERY call even when a
    Latin alphabet was requested, and *chinese_path* was ignored.
    Lambdas make each alphabet lazy; only the requested one is built.
    """
    func_dict = {'lowercase': lambda: map(chr, range(97, 123))
                ,'uppercase': lambda: map(chr, range(65, 91))
                ,'numbers': lambda: map(chr, range(48, 58))
                ,'chinese': lambda: tuple(set(just_chinese(read_txt(chinese_path or corpus))))}
    return func_dict[str_range]()

# Flatten character tuples into candidate strings
def collate_char_iterator(itertools_perm):
    """Lazily join each tuple of characters from *itertools_perm* into one string."""
    return ("".join(chars) for chars in itertools_perm)

# Candidate-string generator
def chargen(language="lowercase", n=1):
    """Yield every length-*n* ordered combination of the *language* alphabet as a string."""
    perms = permutations(create_char_map(language), n)
    return collate_char_iterator(perms)

# Edit-distance: insertion candidates
def add_char(input_char, language="lowercase", n=2, forward=True):
    """Generate candidates by gluing each generated string onto *input_char*.

    ``forward=True`` prepends the generated string; otherwise it is appended.
    (The ``forward == True`` comparison is kept deliberately: any other
    truthy value falls through to the append branch, as in the original.)
    """
    return ("{}{}".format(piece, input_char)
            if forward == True else "{}{}".format(input_char, piece)
            for piece in chargen(language=language, n=n))
# Edit-distance: replacement candidates
def replace_char(input_char, language="lowercase", n=2):
    """Yield candidates where each *n*-wide window of *input_char* is swapped
    for a generated string.

    Note: ``str.replace`` substitutes EVERY occurrence of the window text,
    matching the original behavior.
    """
    width = len(input_char)
    for candidate in chargen(language=language, n=n):
        for start in range(width):
            yield input_char.replace(input_char[start:start + n], candidate)

# Bulk character-deletion helper (same contract as del_stop_word)
def delete_element(strings, symbles=symbles):
    """Remove from *strings* every individual character that appears in *symbles*."""
    # Single translate pass — equivalent to substituting each character of
    # *symbles* with the empty string.
    table = str.maketrans('', '', symbles)
    return strings.translate(table)

# Edit-distance: deletion candidates
def delete_char(input_char, language="lowercase", n=2):
    """Yield candidates formed by deleting each ordered *n*-character combination
    drawn from *input_char* itself.  (*language* is unused, kept for symmetry.)
    """
    return (delete_element(input_char, "".join(combo))
            for combo in permutations(input_char, n))

# Edit-distance: all candidate kinds for one window width
def translation_str(input_char, language="lowercase", n=2):
    """Collect deletion, replacement, prefix-insertion and suffix-insertion
    candidates (in that order) into one tuple."""
    sources = (
        delete_char(input_char, language=language, n=n),
        replace_char(input_char, language=language, n=n),
        add_char(input_char, language=language, n=n, forward=True),
        add_char(input_char, language=language, n=n, forward=False),
    )
    merged = []
    for source in sources:
        merged.extend(source)
    return tuple(merged)

# Edit-distance candidates for every window width 1..n
def translation_n(input_char, language="lowercase", n=2):
    """Stack the candidates of every window width from 1 through *n* into one
    numpy array of strings."""
    collected = []
    for width in range(1, n + 1):
        collected.extend(translation_str(input_char, language=language, n=width))
    return np.array(collected)


# Spelling-correction module: score candidates with P(s|c) * P(c)
@calltime
def check_str(input_char,word_dict=False,error_dict=False):
    """Suggest the most likely intended word for *input_char*.

    Scores each edit-distance-1 Chinese candidate ``s`` of *input_char* by
    ``error_dict[s] * word_dict[input_char]`` and returns a dict with the
    best candidate ("EM"), all scored candidates ("D"), the prior ("C") and
    the last computed score ("bayes").  If *input_char* is not in
    *word_dict*, the input is returned unchanged.

    NOTE(review): if no candidate appears in *error_dict*, ``bayes_`` is
    unbound and ``max(prob_dict)`` raises on the empty dict — the success
    path silently assumes at least one hit; confirm with real data.
    NOTE(review): the default ``error_dict=False`` makes the membership test
    ``sc_element in error_dict`` fail on a bool — callers must pass a dict.
    """
    prob_dict = dict()
    # Falsy word_dict (False, empty dict) triggers a full rebuild from the
    # module-level corpus path.
    if word_dict:
        word_dict = word_dict
    else:
        word_dict = word_dict_func(corpus,log=False)
    if input_char in word_dict:
        # drop empty strings produced by the candidate generators
        check = filter(lambda word : len(word) > 0,translation_n(input_char,language="chinese",n=1))
        Pc = word_dict[input_char] 
        for sc_element in check:
            if sc_element in error_dict:
                Psc = error_dict[sc_element]
                bayes_ = Psc*Pc
                # keyed by score: equal scores silently overwrite each other
                expr = { bayes_ : sc_element }
                prob_dict.update(expr)
        # candidate with the highest score wins
        Eword = prob_dict[max(prob_dict)]
        return {"EM":Eword,"D":prob_dict,"C":Pc,"bayes":bayes_}
    return input_char
    

# Static configuration — NOTE(review): machine-specific absolute paths;
# consider making these CLI arguments or environment variables.
corpus = "/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP数据集合/豆瓣电影数据集(2019.3)/豆瓣电影简介.txt"
dir_path = "/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP数据集合/词库/chinese/"
example_error = location_dict(dir_path)
example_error  # bare expression — notebook/REPL residue, no effect in a script
word_dict = word_dict_func(corpus,log=False)

# Test run: read a word from stdin and look up its correction.
# NOTE(review): indexing test["EM"] raises TypeError when check_str returns
# the raw input string (word not in the dictionary).
test = check_str(input(),word_dict,example_error)
EMword , D , C , bayes = test["EM"],test['D'],test['C'],test['bayes']
EMword  # bare expression — notebook/REPL residue


本文地址:https://blog.csdn.net/weixin_43069769/article/details/107655775

相关标签: python代码整合