项目:用朴素贝叶斯分类,来判断文本是*工作报告 OR 哈利波特小说
程序员文章站
2022-07-08 12:22:43
目的:判断*工作报告 OR 哈利波特小说。数据来源:互联网下载代码项目:判断*工作报告OR哈利波特小说# 引入朴素贝叶斯import bayesfrom bayes import *# 读取数据docList=[]; classList = []; fullText =[]for i in range(1,10+1): wordList = textParse2(open('my_file/report/spam/%d.txt' % i,encoding='UTF-8'...
目的:
用朴素贝叶斯分类,来判断文本是属于*工作报告 OR 哈利波特小说。
数据来源:
来自于互联网下载。
*工作报告放在spam文件夹中,分类为1;哈利波特小说放在ham文件夹中,分类为0。测试文本放在test文件夹中,未标记分类。
分享链接:https://pan.baidu.com/s/1fjbQO19StRy8UspZFJsdAQ
提取码:pypy
代码
项目:判断*工作报告OR哈利波特小说
# 引入朴素贝叶斯
import bayes
from bayes import *
# Load the labelled corpus: files 1..10 from spam/ get label 1 and files
# 1..10 from ham/ get label 0, interleaved so the labels alternate 1, 0, ...
docList = []    # token list of each document, in load order
classList = []  # class label of docList[i] for the labelled documents
fullText = []   # tokens of every loaded document, concatenated
for i in range(1, 10 + 1):
    # `with` closes each file right after it is read; a bare open().read()
    # leaves the handle open until the garbage collector gets to it.
    with open('my_file/report/spam/%d.txt' % i, encoding='UTF-8', errors='ignore') as textFile:
        wordList = textParse2(textFile.read())
    docList.append(wordList)
    fullText.extend(wordList)
    classList.append(1)
    with open('my_file/report/ham/%d.txt' % i, encoding='UTF-8', errors='ignore') as textFile:
        wordList = textParse2(textFile.read())
    docList.append(wordList)
    fullText.extend(wordList)
    classList.append(0)
print(len(docList))
#20
# Also load the unlabelled test documents so their words are part of docList
# when the vocabulary is built (author's note: otherwise classifying the new
# samples fails because these words were never read).
for i in range(1, 3 + 1):
    # `with` closes the file as soon as its text has been read.
    with open('my_file/report/test/%d.txt' % i, encoding='UTF-8', errors='ignore') as textFile:
        wordList = textParse2(textFile.read())
    docList.append(wordList)
    fullText.extend(wordList)
print(len(docList))
#23
# Sanity-check that the data was read correctly
print(docList[0][:10])
print(classList[:10])
#['*', '工作', '报告', '2020', '22', '第十三届', '全国人民代表大会', '第三次', '会议', '国务院']
#[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
# Build the vocabulary and split the labelled documents into train / test
vocabList = createVocabList(docList)  # vocabulary over every loaded document
# Only labelled documents may be sampled. They are the first len(classList)
# entries of docList (20 with the data above); deriving the count from
# classList keeps the split correct if the corpus size changes.
trainingSet = list(range(len(classList)))
testSet = []
for i in range(4):
    # `random` is the name brought in by `from bayes import *`.
    # uniform(0, n) truncated to int gives a valid position in trainingSet.
    randIndex = int(random.uniform(0, len(trainingSet)))
    # Move the chosen document index from the training pool to the test set.
    testSet.append(trainingSet.pop(randIndex))
print(testSet)
print(trainingSet)
#[16, 0, 4, 10]
#[1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19]
# Train the naive Bayes model on the training split
# One bag-of-words count vector and one label per training document.
trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex]) for docIndex in trainingSet]
trainClasses = [classList[docIndex] for docIndex in trainingSet]
# trainNB0 returns the two per-word log-probability vectors and the
# fraction of training documents labelled 1.
p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
# Print the model parameters for inspection
for modelPart in (p0V, p1V, pSpam):
    print(modelPart)
#[-10.17888194 -10.17888194 -9.08026965 ... -10.17888194 -9.48573476 -10.17888194]
#[-10.61304927 -9.91990209 -10.61304927 ... -9.91990209 -10.61304927 -9.91990209]
#0.375
# Evaluate the classifier on the held-out test split
errorCount = 0
for docIndex in testSet:
    wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
    # Classify once and reuse the result for both the check and the report.
    predicted = classifyNB(array(wordVector), p0V, p1V, pSpam)
    if predicted != classList[docIndex]:
        errorCount += 1
        print ("classification error",docList[docIndex])
    else:
        print("classification:",predicted,"real:",classList[docIndex])
# `/` is true division in Python 3, so no float() conversion is needed.
print ('the error rate is: ',errorCount/len(testSet))
#classification: 1 real: 1
#classification: 1 real: 1
#classification: 1 real: 1
#classification: 1 real: 1
#the error rate is: 0.0
# Check the classifier on new data: helper
def classification_test(file, docList, p0V, p1V, pSpam, vocabList=None):
    """Classify the text file at *file* with a trained model and print the result.

    Args:
        file: Path of the UTF-8 text file to classify (undecodable bytes are
            ignored).
        docList: Token lists of all loaded documents; used to build the
            vocabulary when *vocabList* is not given.
        p0V: Per-word log-probability vector for class 0 (from trainNB0).
        p1V: Per-word log-probability vector for class 1 (from trainNB0).
        pSpam: Prior probability of class 1 (from trainNB0).
        vocabList: Optional vocabulary whose word order matches the vectors
            the model was trained on. When omitted it is rebuilt from
            *docList*, as this function has always done.
            NOTE(review): rebuilding relies on createVocabList returning the
            words in the same order as at training time; pass the training
            vocabulary explicitly to avoid depending on that.

    Returns:
        The class returned by classifyNB (1 or 0).
    """
    # Close the file as soon as its text has been read.
    with open(file, encoding='UTF-8', errors='ignore') as textFile:
        wordList = textParse2(textFile.read())
    if vocabList is None:
        vocabList = createVocabList(docList)
    wordVector = bagOfWords2VecMN(vocabList, wordList)
    output = classifyNB(array(wordVector), p0V, p1V, pSpam)
    print(file,'分类结果是',output)
    return output
# Check the classifier on the three unlabelled test files, in order
for testFile in (
    'my_file/report/test/1.txt',
    'my_file/report/test/2.txt',
    'my_file/report/test/3.txt',
):
    classification_test(testFile, docList, p0V, p1V, pSpam)
#my_file/report/test/1.txt 分类结果是 1
#my_file/report/test/2.txt 分类结果是 0
#my_file/report/test/3.txt 分类结果是 0
bayes.py
from numpy import *
def createVocabList(dataSet):
    """Return a list of the distinct tokens found across all documents.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing every distinct token exactly once. The order is
        whatever set iteration yields, so callers must not rely on it being
        sorted.
    """
    vocabSet = set()
    for document in dataSet:
        # Grow the set in place; `vocabSet | set(document)` copied the whole
        # accumulated vocabulary again on every iteration.
        vocabSet.update(document)
    return list(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
    """Return a 0/1 presence vector of *inputSet* over *vocabList*.

    Args:
        vocabList: List of vocabulary words (hashable); position i of the
            result corresponds to vocabList[i].
        inputSet: Iterable of words from one document.

    Returns:
        A list of len(vocabList) integers: 1 where the vocabulary word occurs
        in *inputSet*, 0 elsewhere. Words missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    # Build the word -> position map once instead of scanning vocabList with
    # `in` and `.index()` for every token. setdefault keeps the first
    # position of a repeated word, which is what list.index() returns.
    wordIndex = {}
    for position, word in enumerate(vocabList):
        wordIndex.setdefault(word, position)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            print ("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec
def trainNB0(trainMatrix,trainCategory):
    """Estimate naive Bayes parameters from word-count vectors.

    Args:
        trainMatrix: Sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: Sequence of labels, one per document; a label equal
            to 1 counts as class 1, anything else as class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive): the per-word log-probability vectors for
        class 0 and class 1, and the sum of the labels divided by the number
        of documents.
    """
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(len(trainMatrix))
    # Smoothed accumulators per class: every word count starts at 1 and
    # every total at 2.0, so no word ends up with probability zero.
    wordCounts = {0: ones(vocabSize), 1: ones(vocabSize)}
    totals = {0: 2.0, 1: 2.0}
    for docPos, wordVec in enumerate(trainMatrix):
        label = 1 if trainCategory[docPos] == 1 else 0
        wordCounts[label] += wordVec
        totals[label] += sum(wordVec)
    # Work in log space so that many small probabilities can later be added
    # instead of multiplied.
    p1Vect = log(wordCounts[1] / totals[1])
    p0Vect = log(wordCounts[0] / totals[0])
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if class 1 has the higher log-posterior score, else 0.

    Args:
        vec2Classify: Word vector of the document to classify.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Score = sum of count-weighted word log-probabilities + log prior.
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return int(scoreClass1 > scoreClass0)
def bagOfWords2VecMN(vocabList, inputSet):
    """Return the word-count vector of *inputSet* over *vocabList*.

    Args:
        vocabList: List of vocabulary words (hashable); position i of the
            result corresponds to vocabList[i].
        inputSet: Iterable of words from one document; repeats are counted.

    Returns:
        A list of len(vocabList) integers giving how many times each
        vocabulary word occurs in *inputSet*. Words that are not in the
        vocabulary are silently skipped.
    """
    # Build the word -> position map once instead of scanning vocabList with
    # `in` and `.index()` for every token. setdefault keeps the first
    # position of a repeated word, which is what list.index() returns.
    wordIndex = {}
    for position, word in enumerate(vocabList):
        wordIndex.setdefault(word, position)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
def textParse2(bigString):
    """Split a text into words with jieba and drop one-character tokens.

    Args:
        bigString: The full text of a document as one string.

    Returns:
        A list of the tokens produced by jieba.cut(bigString, cut_all=False)
        whose length is at least 2, in their original order.
    """
    # Imported inside the function, as before, so that importing this module
    # does not itself require jieba. The unused `re` import was dropped.
    import jieba
    listOfTokens = jieba.cut(bigString, cut_all=False)
    return [tok for tok in listOfTokens if len(tok) >= 2]
本文地址:https://blog.csdn.net/m0_46629123/article/details/110441305