双向最大匹配法分词
程序员文章站
2022-07-15 17:31:18
...
# -*- coding: utf-8 -*-
class BiMM():
    """Bidirectional maximum matching (BiMM) word segmenter.

    The text is segmented twice, once scanning left-to-right (``MMseg``)
    and once right-to-left (``RMMseg``), always preferring the longest
    dictionary word that fits in the current window.  ``main`` then picks
    the better of the two results.
    """

    # Built-in demo dictionary, used when no dictionary is supplied.
    DEFAULT_DICTIONARY = ('研究', '研究生', '生命', '命', '的', '起源')

    def __init__(self, dictionary=None, window_size=None):
        """Create a segmenter.

        Args:
            dictionary: Optional iterable of known words.  Defaults to
                ``DEFAULT_DICTIONARY``.  Empty strings are ignored.
            window_size: Optional maximum number of characters tried for a
                single word.  Defaults to the length of the longest
                dictionary word (3 for the default dictionary).
        """
        source = self.DEFAULT_DICTIONARY if dictionary is None else dictionary
        # A frozenset gives O(1) membership tests and is built only once.
        self._words = frozenset(word for word in source if word)
        if window_size is None:
            window_size = max((len(word) for word in self._words), default=1)
        self.window_size = window_size  # length of the longest word to try

    def MMseg(self, text):
        """Segment ``text`` with forward (left-to-right) maximum matching.

        Args:
            text: The string to segment.

        Returns:
            A list of pieces in text order.  Characters that start no
            dictionary word are emitted as single-character pieces.
        """
        result = []
        start = 0
        text_length = len(text)
        window = max(1, self.window_size)  # always try at least one character
        while start < text_length:
            # Try the longest candidate first and shrink from the right.
            for end in range(min(start + window, text_length), start, -1):
                piece = text[start:end]
                if piece in self._words:
                    break
            # Without a match the loop ends on the single character at
            # ``start`` (end == start + 1), which is emitted as-is.
            result.append(piece)
            start = end
        return result

    def RMMseg(self, text):
        """Segment ``text`` with reverse (right-to-left) maximum matching.

        Args:
            text: The string to segment.

        Returns:
            A list of pieces in text order.  Characters that end no
            dictionary word are emitted as single-character pieces.
        """
        result = []
        end = len(text)
        window = max(1, self.window_size)  # always try at least one character
        while end > 0:
            # Clamp the window at the start of the text: a negative slice
            # start would wrap around to the end of the string and produce
            # bogus candidates that drop leading characters.
            for start in range(max(0, end - window), end):
                piece = text[start:end]
                if piece in self._words:
                    break
            # Without a match the loop ends on the single character just
            # before ``end`` (start == end - 1), which is emitted as-is.
            result.append(piece)
            end = start
        result.reverse()  # pieces were collected right-to-left
        return result

    @staticmethod
    def _count_single_chars(pieces):
        """Return how many pieces are one-character strings."""
        return sum(1 for s in pieces if isinstance(s, str) and len(s) == 1)

    def main(self, text, r1, r2):
        """Choose between the forward and reverse results and print it.

        The result with fewer pieces wins.  On a tie the result with fewer
        single-character pieces wins, and if that also ties the forward
        result ``r1`` is used.

        Args:
            text: The original text (unused; kept for interface
                compatibility).
            r1: Forward segmentation result.
            r2: Reverse segmentation result.

        Returns:
            The chosen list, which is also printed.
        """
        if len(r1) != len(r2):
            # Different piece counts: prefer the segmentation with fewer.
            best = r1 if len(r1) < len(r2) else r2
        else:
            # Same piece count: prefer fewer single-character pieces.
            num1 = self._count_single_chars(r1)
            num2 = self._count_single_chars(r2)
            best = r2 if num1 > num2 else r1
        print(best)
        return best
if __name__ == '__main__':
    # Demo: segment a sample sentence in both directions, then let the
    # tokenizer choose and print the preferred segmentation.
    sample = '研究生命的起源'
    segmenter = BiMM()
    forward = segmenter.MMseg(sample)  # ['研究生', '命', '的', '起源']
    backward = segmenter.RMMseg(sample)  # ['研究', '生命', '的', '起源']
    segmenter.main(sample, forward, backward)  # ['研究', '生命', '的', '起源']