Machine Learning: Multi-class Classification with Logistic Regression
This post covers multi-class classification with logistic regression: data visualization, the hypothesis function, the cost function, parameter optimization, a one-vs-all trainer, and model accuracy evaluation. It follows the week 4 programming exercise of Andrew Ng's Machine Learning course, with some of my own ideas mixed in.
- Reading the data
The raw data is a set of 5000 handwritten-digit image samples labeled 1–10, stored in MATLAB format. The matrix has shape 5000×400: each row is a 20×20-pixel digit image flattened into 400 values.
```python
%matplotlib inline
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import cv2 as cv
#import scipy.misc  # used to show a matrix as an image
import matplotlib.cm as cm  # used to display images in a specific colormap
from scipy.special import expit  # the logistic (sigmoid) function
import scipy.io as sio  # scipy module for reading MATLAB files

data = sio.loadmat("CourseraML/ex3/data/ex3data1.mat")  # load the data
X, y = data["X"], data["y"]  # features and labels
X = np.insert(X, 0, 1, axis=1)  # insert an intercept column of ones
print(X.shape, y.shape)  # (5000, 401) (5000, 1)
print(np.unique(y))  # [ 1  2  3  4  5  6  7  8  9 10]
```
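As an optional sanity check (my addition, not part of the original exercise), the label distribution can be inspected directly; the ten digit classes in this dataset should be balanced.

```python
# Optional sanity check (not in the original exercise code):
# count how many samples each class has; the classes should be balanced.
labels, counts = np.unique(y, return_counts=True)
print(dict(zip(labels.tolist(), counts.tolist())))
```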
- Data visualization
```python
def getDataImg(row):  # build the 20*20 = 400-pixel matrix for one sample
    width, height = 20, 20
    square = row[1:].reshape(width, height)  # drop the intercept entry, then reshape
    return square.T

def dataDisplay(indices_to_display=None):  # display a grid of digit images
    width, height = 20, 20
    nrows, ncols = 10, 10
    if not indices_to_display:
        indices_to_display = random.sample(range(X.shape[0]), nrows * ncols)  # draw 10*10 = 100 random samples from the 5000
    big_picture = np.zeros((height * nrows, width * ncols))  # 200*200-pixel canvas
    irow, icol = 0, 0
    for idx in indices_to_display:
        if icol == ncols:
            irow += 1
            icol = 0
        iimg = getDataImg(X[idx])  # build a 20*20-pixel tile
        big_picture[irow * height:irow * height + iimg.shape[0], icol * width:icol * width + iimg.shape[1]] = iimg  # place the tile on the canvas
        icol += 1
    fig = plt.figure(figsize=(6, 6))  # new figure
    #img = Image.fromarray(np.uint8(big_picture), mode="L")  # blurry image
    img = Image.fromarray((big_picture * 5).astype('uint8'), mode="L")  # sharper image; the factor 5 adjusts the contrast
    plt.imshow(img, cmap=cm.Greys_r)

dataDisplay()
```
- Hypothesis function and cost function
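For reference, the code below implements the standard hypothesis and regularized cost from the course, with the intercept term excluded from the penalty:

```latex
h_\theta(x) = \sigma(\theta^{T}x) = \frac{1}{1 + e^{-\theta^{T}x}}

J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h_\theta(x^{(i)}) + \left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^{2}
```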
```python
def h(mytheta, myX):  # hypothesis function: sigmoid of the linear combination
    return expit(np.dot(myX, mytheta))  # shape (5000,) for a 1-D theta

def costFunction(mytheta, myX, myy, mylambda=0.):  # regularized cross-entropy cost
    # expects 1-D vectors: mytheta of shape (401,) and myy of shape (5000,),
    # so the dot products below reduce to scalars
    m = myX.shape[0]  # number of samples, 5000
    myh = h(mytheta, myX)  # (5000,)
    term1 = np.log(myh).dot(-myy)  # scalar
    term2 = np.log(1 - myh).dot(1 - myy)  # scalar
    term3 = mytheta[1:].dot(mytheta[1:]) * mylambda / (2 * m)  # regularization term; the intercept is not penalized
    return (term1 - term2) / m + term3

initial_theta = np.zeros(X.shape[1])  # initialize theta as a 1-D vector, shape (401,)
#print(costFunction(initial_theta, X, (y == 10).astype(float).flatten()))  # scalar cost for one binary task
```
- Gradient and optimization
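The corresponding gradient, again with the intercept left unregularized, is:

```latex
\frac{\partial J}{\partial \theta_0} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_0^{(i)},\qquad
\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \geq 1)
```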
```python
from scipy import optimize

def costGradient(mytheta, myX, myy, mylambda=0.):  # gradient of the regularized cost
    m = myX.shape[0]  # number of samples
    beta = h(mytheta, myX) - myy  # prediction error, shape (5000,)
    regterm = mytheta[1:] * (mylambda / m)  # regularization term, shape (400,); the intercept is not penalized
    grad = (1. / m) * np.dot(myX.T, beta)  # shape (401,)
    grad[1:] = grad[1:] + regterm
    return grad  # shape (401,)

def optimizeTheta(mytheta, myX, myy, mylambda=0.):
    result = optimize.fmin_cg(costFunction, fprime=costGradient, x0=mytheta,
                              args=(myX, myy, mylambda), maxiter=50, disp=False,
                              full_output=True)  # returns the optimal theta and the minimum cost
    return result[0], result[1]
```
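Before running the full training loop it is worth verifying the analytic gradient. The following is a minimal finite-difference check of my own (not part of the original post); the tiny random problem and the names `Xs`, `ys` are made up for illustration:

```python
# Minimal gradient check (my addition, not from the original exercise):
# compare costGradient against central finite differences on a tiny random problem.
rng = np.random.RandomState(0)
Xs = np.insert(rng.randn(20, 5), 0, 1, axis=1)  # 20 samples, 5 features plus an intercept column
ys = rng.randint(0, 2, 20).astype(float)        # random binary labels
theta = 0.1 * rng.randn(6)
eps = 1e-5
numgrad = np.zeros_like(theta)
for j in range(len(theta)):
    step = np.zeros_like(theta)
    step[j] = eps
    numgrad[j] = (costFunction(theta + step, Xs, ys, 1.0) -
                  costFunction(theta - step, Xs, ys, 1.0)) / (2 * eps)
# the largest deviation should be tiny (around 1e-9 or smaller)
print(np.max(np.abs(numgrad - costGradient(theta, Xs, ys, 1.0))))
```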
- One-vs-all trainer
```python
def oneVsAll(myX, myy, labels_num, mylambda):  # one-vs-all classification trainer
    all_theta = np.zeros((labels_num, myX.shape[1]))  # parameters of the 10 hypotheses, 10*401
    initial_theta = np.zeros(myX.shape[1])  # initial parameter vector, shape (401,)
    for i in range(labels_num):
        print('Optimizing for handwritten number {}...'.format(i))
        iclass = i if i else 10  # the digit 0 is stored as class 10 in the dataset
        logic_Y = np.array([1 if x == iclass else 0 for x in myy])  # binary labels for this round, shape (5000,)
        itheta, imincost = optimizeTheta(initial_theta, myX, logic_Y, mylambda)
        all_theta[i, :] = itheta  # shape (401,)
    return all_theta  # the optimal parameters of the ten binary classifiers
```
- Prediction and accuracy evaluation
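One-vs-all prediction simply picks the class whose binary classifier outputs the highest probability:

```latex
\hat{y} = \arg\max_{k \in \{1,\dots,K\}} \sigma\!\left(\theta_k^{T} x\right)
```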
```python
def predict_one_vs_all(myX, all_theta):  # multi-class prediction
    hypots = h(all_theta.T, myX)  # hypothesis values, 5000*10: ten scores per row
    h_argmax = np.argmax(hypots, axis=1)  # index of the largest hypothesis value in each row
    return h_argmax

all_theta = oneVsAll(X, y, 10, 1)  # 10*401
y_pred = predict_one_vs_all(X, all_theta)  # (5000,)
y_pred = [x if x else 10 for x in y_pred]  # map the predicted index 0 back to class 10
#print(y_pred)  # predicted classes
#print(list(y.reshape(-1)))  # true classes
n_correct, n_total = 0, 0  # number correct, total number
for row in range(len(y_pred)):
    n_total += 1
    if y_pred[row] == y[row]:
        n_correct += 1  # count a correct prediction
accuracy = np.round(n_correct / n_total, 2)
print("The accuracy is {}%".format(accuracy * 100))
```
- Output
The final output looks roughly like this; the accuracy is around 94%.
```
Optimizing for handwritten number 0...
Optimizing for handwritten number 1...
Optimizing for handwritten number 2...
Optimizing for handwritten number 3...
Optimizing for handwritten number 4...
Optimizing for handwritten number 5...
Optimizing for handwritten number 6...
Optimizing for handwritten number 7...
Optimizing for handwritten number 8...
Optimizing for handwritten number 9...
The accuracy is 94.0%
```
Original post: https://blog.csdn.net/zengbowengood/article/details/107442085