欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页  >  IT编程

项目:用朴素贝叶斯分类来判断文本属于*工作报告 OR 哈利波特小说

程序员文章站 2022-04-02 11:09:34
目的:判断*工作报告 OR 哈利波特小说。数据来源:互联网下载代码项目:判断*工作报告OR哈利波特小说# 引入朴素贝叶斯import bayesfrom bayes import *# 读取数据docList=[]; classList = []; fullText =[]for i in range(1,10+1): wordList = textParse2(open('my_file/report/spam/%d.txt' % i,encoding='UTF-8'...

目的:

用朴素贝叶斯分类,来判断文本是属于*工作报告 OR 哈利波特小说。

数据来源:

来自于互联网下载。
*工作报告放在spam文件夹中,分类为1;哈利波特小说放在ham文件夹中,分类为0。测试文本放在test文件夹中,未标记分类。
分享链接:https://pan.baidu.com/s/1fjbQO19StRy8UspZFJsdAQ
提取码:pypy

代码

项目:判断*工作报告OR哈利波特小说

# 引入朴素贝叶斯
import bayes
from bayes import *

# Load the labelled training corpus.  For every index 1..10 the file from the
# "spam" folder (label 1) is read first and the file from the "ham" folder
# (label 0) second, so docList and classList stay index-aligned and the labels
# alternate 1, 0, 1, 0, ...
docList = []
classList = []
fullText = []
for i in range(1, 10 + 1):
    for folder, label in (('spam', 1), ('ham', 0)):
        path = 'my_file/report/%s/%d.txt' % (folder, i)
        # A context manager closes each file as soon as it has been read
        # instead of leaving the handle open until garbage collection.
        with open(path, encoding='UTF-8', errors='ignore') as f:
            wordList = textParse2(f.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(label)

print(len(docList))
# Output of the original run:
# 20
# Also read the three unlabelled samples from the test folder.  They are
# appended to docList (but get no entry in classList) so that their words are
# included when the vocabulary is created from docList.
for i in range(1, 3 + 1):
    # A context manager closes each file as soon as it has been read.
    with open('my_file/report/test/%d.txt' % i, encoding='UTF-8', errors='ignore') as f:
        wordList = textParse2(f.read())
    docList.append(wordList)
    fullText.extend(wordList)

print(len(docList))
# Output of the original run:
# 23

# Sanity check of the loaded data: the first ten tokens of the first document
# and the first ten class labels.
print(docList[0][:10])
print(classList[:10])
# Output of the original run:
# ['*', '工作', '报告', '2020', '22', '第十三届', '全国人民代表大会', '第三次', '会议', '国务院']
# [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
# Build the vocabulary from every loaded document, then move four randomly
# chosen indices out of the twenty labelled documents (0..19) into testSet;
# the indices that remain in trainingSet are used for training.
vocabList = createVocabList(docList)
trainingSet = list(range(20))
testSet = []
for i in range(4):
    randIndex = int(random.uniform(0, len(trainingSet)))
    # pop() removes the drawn index from the training pool and returns it.
    testSet.append(trainingSet.pop(randIndex))

print(testSet)
print(trainingSet)
# Output of one run of the original script (the split is random):
# [16, 0, 4, 10]
# [1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19]

# Train the naive Bayes model on the documents whose indices are left in
# trainingSet: each document becomes a bag-of-words count vector over
# vocabList and is paired with its class label.
trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex]) for docIndex in trainingSet]
trainClasses = [classList[docIndex] for docIndex in trainingSet]
p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

# Print the learned parameters for inspection: the per-word log-probability
# vectors of class 0 and class 1, and the fraction of training documents
# labelled 1.
print(p0V)
print(p1V)
print(pSpam)
# Output of one run of the original script:
# [-10.17888194 -10.17888194  -9.08026965 ... -10.17888194  -9.48573476 -10.17888194]
# [-10.61304927  -9.91990209 -10.61304927 ...  -9.91990209 -10.61304927  -9.91990209]
# 0.375
# Evaluate the classifier on the held-out documents in testSet by comparing
# each prediction with the known label.
errorCount = 0
for docIndex in testSet:
    wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
    predicted = classifyNB(array(wordVector), p0V, p1V, pSpam)
    actual = classList[docIndex]
    if predicted == actual:
        print("classification:", predicted, "real:", actual)
    else:
        errorCount += 1
        print("classification error", docList[docIndex])
print('the error rate is: ', float(errorCount) / len(testSet))
# Output of one run of the original script:
# classification: 1 real: 1
# classification: 1 real: 1
# classification: 1 real: 1
# classification: 1 real: 1
# the error rate is:  0.0
# 用新数据来检测分类器效果0
def classification_test(file, docList, p0V, p1V, pSpam, vocabList=None):
    """Classify one text file with an already trained naive Bayes model.

    Args:
        file: Path of the text file to classify (read as UTF-8, undecodable
            bytes are ignored).
        docList: Token lists from which the vocabulary is rebuilt when
            ``vocabList`` is not supplied.
        p0V: Per-word log-probability vector of class 0, as returned by
            ``trainNB0``.
        p1V: Per-word log-probability vector of class 1, as returned by
            ``trainNB0``.
        pSpam: Prior probability of class 1, as returned by ``trainNB0``.
        vocabList: Optional vocabulary to vectorise the file with.  Pass the
            list that was used for training so the feature positions are
            guaranteed to line up with ``p0V``/``p1V``.  When omitted, the
            vocabulary is rebuilt from ``docList`` as before.

    Returns:
        The class label returned by ``classifyNB`` (1 or 0).
    """
    # The context manager closes the file right after it has been read.
    with open(file, encoding='UTF-8', errors='ignore') as f:
        wordList = textParse2(f.read())
    if vocabList is None:
        vocabList = createVocabList(docList)
    wordVector = bagOfWords2VecMN(vocabList, wordList)
    output = classifyNB(array(wordVector), p0V, p1V, pSpam)
    print(file, '分类结果是', output)
    return output

# Classify the three unlabelled sample files with the trained model;
# classification_test prints the result for each file.
for samplePath in (
    'my_file/report/test/1.txt',
    'my_file/report/test/2.txt',
    'my_file/report/test/3.txt',
):
    classification_test(samplePath, docList, p0V, p1V, pSpam)
# Output of the original run:
# my_file/report/test/1.txt 分类结果是 1
# my_file/report/test/2.txt 分类结果是 0
# my_file/report/test/3.txt 分类结果是 0

bayes.py


from numpy import *
                 
def createVocabList(dataSet):
    """Return a list of the distinct tokens found in all documents.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing every distinct token once.  The order comes from
        set iteration and is therefore not sorted.
    """
    vocabulary = set()
    for tokens in dataSet:
        # Union with the tokens of this document; duplicates collapse.
        vocabulary = vocabulary | set(tokens)
    return list(vocabulary)

def setOfWords2Vec(vocabList, inputSet):
    """Build a presence (set-of-words) vector for a document.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of hashable words of the document to encode.

    Returns:
        A list of ``len(vocabList)`` integers: 1 where the vocabulary word
        occurs in ``inputSet``, 0 elsewhere.  A message is printed for every
        word of ``inputSet`` that is not in the vocabulary.
    """
    # Index the vocabulary once so each word is an O(1) lookup instead of a
    # linear `in` test followed by a linear `.index()` scan.  setdefault keeps
    # the first position of a repeated word, matching list.index().
    position = {}
    for idx, vocab_word in enumerate(vocabList):
        position.setdefault(vocab_word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters for a two-class problem.

    Args:
        trainMatrix: Sequence of equally long word-count vectors, one per
            training document.
        trainCategory: Sequence of class labels aligned with ``trainMatrix``;
            a document counts as class 1 when its label equals 1, otherwise
            as class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pClass1)``: the element-wise logarithm of
        the smoothed per-word frequencies for class 0 and class 1, and the
        sum of the labels divided by the number of documents.
    """
    doc_total = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    prior_class1 = sum(trainCategory) / float(doc_total)

    # Slot 0 accumulates class 0, slot 1 accumulates class 1.  Word counts
    # start at one and the totals at 2.0, so no ratio is ever zero before the
    # logarithm is taken.
    word_counts = [ones(vocab_size), ones(vocab_size)]
    token_totals = [2.0, 2.0]
    for row_no in range(doc_total):
        row = trainMatrix[row_no]
        slot = 1 if trainCategory[row_no] == 1 else 0
        word_counts[slot] += row
        token_totals[slot] += sum(row)

    log_p1 = log(word_counts[1] / token_totals[1])
    log_p0 = log(word_counts[0] / token_totals[0])
    return log_p0, log_p1, prior_class1

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of two classes for a word-count vector.

    Args:
        vec2Classify: Array of word counts for the document to classify.
        p0Vec: Array of per-word log-probabilities for class 0.
        p1Vec: Array of per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0 (ties go to class 0).
    """
    # Each score is the count-weighted sum of log-probabilities plus the log
    # of the class prior.
    score_class1 = sum(vec2Classify * p1Vec) + log(pClass1)
    score_class0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if score_class1 > score_class0 else 0
    
def bagOfWords2VecMN(vocabList, inputSet):
    """Build a bag-of-words count vector for a document.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of hashable words of the document to encode.

    Returns:
        A list of ``len(vocabList)`` integers holding how many times each
        vocabulary word occurs in ``inputSet``.  Words that are not in the
        vocabulary are ignored.
    """
    # Index the vocabulary once so each word is an O(1) lookup instead of a
    # linear `in` test followed by a linear `.index()` scan.  setdefault keeps
    # the first position of a repeated word, matching list.index().
    position = {}
    for idx, vocab_word in enumerate(vocabList):
        position.setdefault(vocab_word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            returnVec[idx] += 1
    return returnVec

    
def textParse2(bigString):
    """Split a text into tokens with jieba and drop very short ones.

    Args:
        bigString: The text to segment.

    Returns:
        A list of the tokens produced by ``jieba.cut`` (called with
        ``cut_all=False``) that are at least two characters long.
    """
    # Kept as a function-local import, as in the original; the unused
    # `import re` has been removed.
    import jieba

    return [tok for tok in jieba.cut(bigString, cut_all=False) if len(tok) >= 2]

本文地址:https://blog.csdn.net/m0_46629123/article/details/110441305