欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员计算机编程知识!
您现在的位置是: 首页

机器学习实战Chp13: 利用PCA简化数据

程序员文章站 2022-07-16 18:05:25
...
  • 主要参考周志华的《机器学习》,P230

  • PCA核心程序
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 29 21:42:05 2018

@author: muli
"""

from numpy import *


#mean(a, axis, dtype, out,keepdims )函数
#    功能:求取均值;
#    设a为m * n矩阵举例:
#    axis 不设置值,对 m*n 个数求均值,返回一个实数
#    axis = 0:压缩行,对各列求均值,返回 1* n 矩阵
#    axis =1 :压缩列,对各行求均值,返回 m *1 矩阵


# 数据格式处理
def loadDataSet(fileName, delim='\t'):
    """Read a delimited text file of numbers into a NumPy matrix.

    Every line is stripped of surrounding whitespace, split on ``delim``
    and each field is converted with ``float``.  Lines that are empty
    after stripping are skipped.

    Args:
        fileName: path of the text file to read.
        delim: field separator used inside each line (tab by default).

    Returns:
        A ``numpy.matrix`` with one row per non-empty line of the file.

    Raises:
        ValueError: if a field cannot be converted to ``float``.
    """
    # The context manager closes the file even when a field fails to parse.
    with open(fileName) as fr:
        stripped = (line.strip() for line in fr)
        # Build real lists of floats: on Python 3 ``map`` is lazy, and
        # handing ``mat`` a list of map objects yields an object matrix
        # instead of a numeric one.
        datArr = [[float(field) for field in row.split(delim)]
                  for row in stripped if row]
    return mat(datArr)


# 注意:代码中的X和W 与 周志华书上P230的X和W为转置关系
def pca(dataMat, topNfeat=9999999):
    """Reduce data with PCA and reconstruct it from the kept components.

    Rows of ``dataMat`` are treated as samples and columns as features
    (the mean is taken over axis 0 and ``cov`` is called with
    ``rowvar=0``).

    Args:
        dataMat: samples-by-features data, normally a ``numpy.matrix``
            such as the one returned by ``loadDataSet``.
        topNfeat: number of principal components to keep.  The large
            default keeps every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the centred data projected
        onto the kept eigenvectors, and that projection mapped back to
        the original feature space with the column means added back.
    """
    # Centre the data: subtract the per-column mean (no variance scaling).
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    # Covariance between features (columns are the variables).
    covMat = cov(meanRemoved, rowvar=0)
    # A covariance matrix is symmetric, so use the symmetric solver: it
    # always returns real eigenvalues and orthonormal eigenvectors,
    # whereas the general ``eig`` may return complex values from round-off.
    eigVals, eigVects = linalg.eigh(mat(covMat))
    # argsort orders ascending; walking the index array backwards for
    # topNfeat steps selects the largest eigenvalues, largest first.
    eigValInd = argsort(eigVals)
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    # Columns of eigVects are the eigenvectors; keep the selected ones.
    redEigVects = eigVects[:, eigValInd]
    # Project the centred data onto the kept components.
    lowDDataMat = meanRemoved * redEigVects
    # Map the projection back to feature space and restore the mean.
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat


# 测试模块
if __name__ == "__main__":
    # Smoke test: load the sample file, keep a single principal
    # component and report the shape of the projected data.
    samples = loadDataSet('testSet.txt')
    projected, _ = pca(samples, 1)
    print(projected.shape)
  • 绘图

  • PCA降维前

# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 20:03:32 2018

@author: muli
"""

from numpy import *
import matplotlib
import matplotlib.pyplot as plt


# Generate n correlated 2-D points, save them to testSet.txt (one
# tab-separated pair per line) and show them as a scatter plot.
n = 1000  # number of points to create
xcord0 = []
ycord0 = []
# The context manager flushes and closes the file even if the loop raises.
with open('testSet.txt', 'w') as fw:
    for _ in range(n):
        r0, r1 = random.standard_normal(2)
        # x is standard-normal noise shifted to be centred on 9.
        fFlyer = r0 + 9.0
        # y is x plus independent standard-normal noise, so the two
        # coordinates are correlated.
        tats = r1 + fFlyer
        xcord0.append(fFlyer)
        ycord0.append(tats)
        fw.write("%f\t%f\n" % (fFlyer, tats))

fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xcord0, ycord0, marker='^', s=90)
plt.xlabel('hours of direct sunlight')
plt.ylabel('liters of water')
plt.show()
  • PCA降维后
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 20:04:20 2018

@author: muli
"""

from numpy import *
import matplotlib
import matplotlib.pyplot as plt
import PCA_test


# Load the saved samples and run PCA keeping one principal component.
dataMat = PCA_test.loadDataSet('testSet.txt')
lowDMat, reconMat = PCA_test.pca(dataMat, 1)

fig = plt.figure()
ax = fig.add_subplot(111)
# Draw the original samples first, then the reconstruction on top of them.
layers = (
    (dataMat, dict(marker='^', s=90)),
    (reconMat, dict(marker='o', s=50, c='red')),
)
for points, style in layers:
    # .A1 turns a matrix column into the flat 1-D array scatter expects.
    ax.scatter(points[:, 0].A1, points[:, 1].A1, **style)
plt.show()

机器学习实战Chp13: 利用PCA简化数据