# Handwritten digit recognition (手写数字识别)
# Source: 程序员文章站 (Programmer Article Station), 2022-03-28 13:57:52
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_lfw_people
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# --- Part 1: Face recognition (人脸识别) ---
# Fetch the Labeled Faces in the Wild dataset.
# NOTE(review): `faces` was never defined in the original transcript even though
# fetch_lfw_people was imported; min_faces_per_person=60 is the standard call
# that yields the shapes shown in the original outputs:
#   faces.images.shape == (1348, 62, 47)  -> one 62x47 grayscale image each
#   faces.data.shape   == (1348, 2914)    -> flattened pixels (62 * 47 = 2914)
faces = fetch_lfw_people(min_faces_per_person=60)

# Show the first 20 face images on a 4x5 grid with axis ticks suppressed.
# `fig` is the canvas; `axes` is a 4x5 ndarray of Axes objects and
# `axes.flat` iterates it as a flat (1-D) sequence.
fig, axes = plt.subplots(4, 5, figsize=(8, 4),
                         subplot_kw={"xticks": [], "yticks": []})
for i, ax in enumerate(axes.flat):
    ax.imshow(faces.images[i, :, :], cmap="gray")

# Dimensionality reduction: project the 2914-dim pixel space onto the
# first 150 principal components.
pca = PCA(150).fit(faces.data)
v = pca.components_  # shape (150, 2914): each row is an "eigenface"

# Visualize the first 24 eigenfaces, reshaped back to 62x47 images.
fig, axes = plt.subplots(3, 8, figsize=(8, 4),
                         subplot_kw={"xticks": [], "yticks": []})
for i, ax in enumerate(axes.flat):
    ax.imshow(v[i, :].reshape(62, 47), cmap="gray")
# --- Part 2: Digit recognition (数字识别) ---
# Load the Kaggle digit-recognizer dataset.
# CSV layout: column 0 is the label (digit 0-9); columns 1..784 are the
# 28x28 = 784 grayscale pixel intensities (pixel0 .. pixel783).
# data.shape == (42000, 785)
data = pd.read_csv("digit recognizor.csv")

x = data.iloc[:, 1:]  # features: (42000, 784) pixel columns
y = data.iloc[:, 0]   # labels: the leading "label" column
# Step 2: plot the cumulative explained-variance ratio to bracket the
# range of the best reduced dimensionality.
# Fit a full PCA (all 784 components kept) so we can inspect how much
# variance each additional component explains.
pca_line = PCA().fit(x)

plt.figure(figsize=(20, 5))
# Cumulative sum of per-component variance ratios: the curve's elbow
# suggests how many components are worth keeping.
plt.plot(np.cumsum(pca_line.explained_variance_ratio_))
plt.xlabel("number of components after dimension reduction")
plt.ylabel("cumulative explained variance ratio")
plt.show()
# Coarse search: cross-validated accuracy for 1..100 components (step 10)
# to find the rough region of the best dimensionality.
score = []
for i in range(1, 101, 10):
    x_dr = PCA(i).fit_transform(x)
    once = cross_val_score(RFC(n_estimators=10, random_state=0),
                           x_dr, y, cv=10).mean()
    score.append(once)
plt.figure(figsize=(20, 5))
# BUG FIX: the original called plt.show() on an empty figure — the plot
# call was missing, so no learning curve was ever drawn.
plt.plot(range(1, 101, 10), score)
plt.show()
# Refine the learning curve over 10..24 components to pin down the best
# dimensionality: for each candidate, reduce with PCA and score a small
# random forest via 10-fold cross-validation.
dims = range(10, 25)
score = [
    cross_val_score(RFC(n_estimators=10, random_state=0),
                    PCA(d).fit_transform(x), y, cv=10).mean()
    for d in dims
]
plt.figure(figsize=(20, 5))
plt.plot(dims, score)
plt.show()
# Reduce to the chosen best dimensionality (23 components, per the fine
# learning curve) and evaluate a larger forest with 5-fold CV.
x_dr = PCA(23).fit_transform(x)
# Printed so the result is visible when run as a script (the original
# transcript showed ~0.9462 mean accuracy on this dataset).
print(cross_val_score(RFC(n_estimators=100, random_state=0),
                      x_dr, y, cv=5).mean())