scikit-learn: LDA and QDA

1.2. Linear and Quadratic Discriminant Analysis

Linear and Quadratic Discriminant Analysis with covariance ellipsoid

from scipy import linalg
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
cmap = colors.LinearSegmentedColormap('red_blue_classes',
                                      {'red': [(0, 1, 1), (1, 0.7, 0.7)],
                                       'green': [(0, 0.7, 0.7), (1, 0.7, 0.7)],
                                       'blue': [(0, 0.7, 0.7), (1, 1, 1)]})
plt.cm.register_cmap(cmap=cmap)
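# On matplotlib >= 3.7 plt.cm.register_cmap is deprecated; the equivalent
# call there is matplotlib.colormaps.register(cmap).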
aa = [(0, 1, 1), (1, 0.7, 0.7)]             # a list of tuples
aa1 = [[0, 1, 1], [1, 0.7, 0.7]]            # also a list (of lists)
bb = np.array([[0, 1, 1], [1, 0.7, 0.7]])   # an ndarray
cc = np.mat([[0, 1, 1], [1, 0.7, 0.7]])     # an np.matrix
cc2 = cc * 2
np.dot(cc, cc2.transpose()) == cc * cc2.transpose()  # for np.matrix both sides are matrix products, so this is all True
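# A quick sketch of the distinction: for plain ndarrays * is elementwise,
# while np.dot (or the @ operator) is the matrix product; only np.matrix
# overloads * to mean matrix multiplication.
bb2 = bb * 2                                  # elementwise scaling of an ndarray
(bb * bb2).shape                              # (2, 3): * stays elementwise
np.dot(bb, bb2.T).shape                       # (2, 2): np.dot is the matrix product
np.allclose(bb @ bb2.T, np.dot(bb, bb2.T))    # True: @ and np.dot agree here
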
def dataset_fix_cov():
    # Generate two datasets that share the same covariance matrix
    n, dim = 300, 2
    np.random.seed(0)
    C = np.array([[0, -0.23], [0.83, 0.23]])
    X = np.r_[np.dot(np.random.randn(n, dim), C),
              np.dot(np.random.randn(n, dim), C) + np.array([1, 1])]
    # np.dot on 2-D arrays is matrix multiplication (for ndarrays this is @, not *)
    Y = np.hstack((np.zeros(n), np.ones(n)))
    return X, Y

def dataset_cov():
    # Generate two datasets with different covariance matrices
    n, dim = 300, 2
    np.random.seed(0)
    C = np.array([[0., -1.], [2.5, .7]]) * 2.  # this matrix sets the covariance
    X = np.r_[np.dot(np.random.randn(n, dim), C),
              np.dot(np.random.randn(n, dim), C.T) + np.array([1, 4])]
    Y = np.hstack((np.zeros(n), np.ones(n)))
    return X, Y
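
# Why multiplying by C "sets the covariance" (a sketch): if Z has i.i.d.
# standard-normal rows, then X = Z @ C has covariance close to C.T @ C,
# so choosing different C's gives the two classes different covariances.
Z_demo = np.random.randn(100_000, 2)
C_demo = np.array([[0., -1.], [2.5, .7]]) * 2.
print(np.cov((Z_demo @ C_demo).T))   # empirical covariance of Z @ C ...
print(C_demo.T @ C_demo)             # ... is close to C.T @ C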

def plot_data(lda,X,Y,Y_pred,fig_index):
    splot = plt.subplot(2,2,fig_index)
    if fig_index==1:
        plt.title('Linear Discriminant Analysis')
        plt.ylabel('Data with\n fixed covariance')
    elif fig_index==2:
        plt.title('Quadratic Discriminant Analysis')
    elif fig_index==3:
        plt.ylabel('Data with\n varying covariance')
    tp = (Y == Y_pred)  # mask of correctly classified samples
    tp0, tp1 = tp[Y == 0], tp[Y == 1]  # split the mask by class (boolean masks, not indices)
    X0, X1 = X[Y == 0], X[Y == 1]  # split all samples by class
    X0_tp, X0_fp = X0[tp0], X0[~tp0]
    # X values of correctly vs. incorrectly classified y=0 samples
    X1_tp, X1_fp = X1[tp1], X1[~tp1]

    alpha = 0.5
    plt.plot(X0_tp[:, 0], X0_tp[:, 1], 'o', alpha=alpha, color='red',
             markeredgecolor='k')  # correctly classified y=0 points
    plt.plot(X0_fp[:, 0], X0_fp[:, 1], '*', alpha=alpha, color='#990000',
             markeredgecolor='k')  # misclassified y=0 points

    plt.plot(X1_tp[:, 0], X1_tp[:, 1], 'o', alpha=alpha, color='blue',
             markeredgecolor='k')  # correctly classified y=1 points
    plt.plot(X1_fp[:, 0], X1_fp[:, 1], '*', alpha=alpha, color='#000099',
             markeredgecolor='k')  # misclassified y=1 points
    nx, ny = 200, 100
    x_min, x_max = plt.xlim()
    y_min, y_max = plt.ylim()
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                         np.linspace(y_min, y_max, ny))
    z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    z = z[:, 1].reshape(xx.shape)  # P(y=1) over the grid
    plt.pcolormesh(xx, yy, z, cmap='red_blue_classes',
                   norm=colors.Normalize(0, 1))
    plt.contour(xx, yy, z, [0.5], linewidths=2, colors='k')  # decision boundary
    plt.plot(lda.means_[0][0], lda.means_[0][1],
             'o', color='black', markersize=10, markeredgecolor='k')  # class means
    plt.plot(lda.means_[1][0], lda.means_[1][1],
             'o', color='black', markersize=10, markeredgecolor='k')

    return splot

def plot_ellipse(splot, mean, cov, color):
    v, w = linalg.eigh(cov)  # eigenvalues v (ascending) and eigenvectors w
    u = w[0] / linalg.norm(w[0])
    angle = np.arctan(u[1] / u[0])
    angle = 180 * angle / np.pi  # convert to degrees
    # filled Gaussian at 2 standard deviations
    ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
                              angle=180 + angle, facecolor=color,
                              edgecolor='yellow',
                              linewidth=2, zorder=2)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(0.5)
    splot.add_artist(ell)
    splot.set_xticks(())
    splot.set_yticks(())

def plot_lda_cov(lda, splot):
    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')


def plot_qda_cov(qda, splot):
    plot_ellipse(splot, qda.means_[0], qda.covariance_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariance_[1], 'blue')

for i, (X, Y) in enumerate([dataset_fix_cov(), dataset_cov()]):
    # i=0: datasets share one covariance; i=1: covariances differ
    lda = LinearDiscriminantAnalysis(solver='svd',store_covariance=True)
    Y_pred = lda.fit(X,Y).predict(X)
    splot = plot_data(lda,X,Y,Y_pred,fig_index=2*i+1)
    plot_lda_cov(lda,splot)
    plt.axis('tight')

    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    Y_pred = qda.fit(X,Y).predict(X)
    splot = plot_data(qda,X,Y,Y_pred,fig_index=2*i+2)
    plot_qda_cov(qda,splot)
    plt.axis('tight')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant '
             'Analysis')
plt.show()
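
For the two-class data above, the LDA decision boundary drawn in black is the straight line w·x + b = 0. A minimal sketch of how to read that line off the fitted model (this reuses the lda object from the loop's last iteration; the x range is an arbitrary illustrative choice):

w = lda.coef_[0]        # weight vector, shape (2,) for two features
b = lda.intercept_[0]   # bias term
xs = np.linspace(-5, 5, 5)     # arbitrary x values along the boundary
ys = -(w[0] * xs + b) / w[1]   # solve w[0]*x + w[1]*y + b = 0 for y
print(np.c_[xs, ys])           # points lying on the decision line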

Comparison of LDA and PCA 2D projection of Iris dataset

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
iris = datasets.load_iris()
x = iris.data
y = iris.target
target_names = iris.target_names

pca = PCA(n_components=2)  # keep two principal components
x_r = pca.fit(x).transform(x)  # PCA is unsupervised: it uses only X, not y
lda = LinearDiscriminantAnalysis(n_components=2)
x_r2 = lda.fit(x, y).transform(x)  # note that LDA is supervised and needs y
print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))
plt.figure()
colors = ['navy', 'turquoise', 'darkorange']
lw = 2
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(x_r[y == i, 0], x_r[y == i, 1], c=color, alpha=0.5,
                linewidths=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of IRIS dataset')

plt.figure()  # start a new figure so the LDA plot does not draw over the PCA one
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(x_r2[y == i, 0], x_r2[y == i, 1], c=color, alpha=0.5,
                linewidths=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
# scatterpoints controls how many marker points are drawn in each legend
# entry; with a large value (e.g. 200) the legend markers merge into a line
plt.title('LDA of IRIS dataset')
plt.show()
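
One detail worth knowing: LDA can project onto at most n_classes - 1 axes, so for the 3-class iris data n_components=2 is already the maximum, while PCA has no such cap. A quick sketch to confirm the shapes:

print(x_r.shape)   # (150, 2): PCA keeps the two requested components
print(x_r2.shape)  # (150, 2): LDA is capped at n_classes - 1 = 2 components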

Normal and Shrinkage Linear Discriminant Analysis for classification

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

n_train = 20          # number of training samples
n_test = 200          # number of test samples
n_average = 50        # number of repetitions to average over
n_features_max = 75   # maximum number of features
step = 4              # step size over the feature counts

def generate_data(n_samples, n_features):
    # Only the first feature is discriminative (two blobs centered at -2
    # and +2); the remaining n_features - 1 columns are pure noise.
    x, y = make_blobs(n_samples=n_samples, n_features=1, centers=[[-2], [2]])
    if n_features > 1:
        x = np.hstack([x, np.random.randn(n_samples, n_features - 1)])
    return x, y
acc_clf1, acc_clf2 = [], []
n_features_range = range(1, n_features_max, step)
for n_features in n_features_range:
    score_clf1, score_clf2 = 0, 0
    for _ in range(n_average):
        x, y = generate_data(n_train, n_features)
        clf1 = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto').fit(x, y)
        clf2 = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=None).fit(x, y)
        x, y = generate_data(n_test, n_features)
        score_clf1 += clf1.score(x, y)
        score_clf2 += clf2.score(x, y)
    acc_clf1.append(score_clf1 / n_average)
    acc_clf2.append(score_clf2 / n_average)

feature_samples_ratio = np.array(n_features_range)/n_train
plt.plot(feature_samples_ratio, acc_clf1, linewidth=2,
         label="Linear Discriminant Analysis with shrinkage", color='navy')
plt.plot(feature_samples_ratio, acc_clf2, linewidth=2,
         label="Linear Discriminant Analysis", color='gold')

plt.xlabel('n_features / n_samples')
plt.ylabel('Classification accuracy')

plt.legend(loc=1, prop={'size': 12})
plt.suptitle('Linear Discriminant Analysis vs. shrinkage Linear Discriminant '
             'Analysis (1 discriminative feature)')
plt.show()
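
Here shrinkage='auto' chooses the shrinkage intensity with the Ledoit-Wolf lemma, while shrinkage=None uses the plain empirical covariance. A fixed intensity between 0 and 1 can also be passed by hand; a minimal sketch (the 0.5 is an arbitrary illustrative value):

x, y = generate_data(n_train, n_features_max)
clf_fixed = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=0.5).fit(x, y)
x_test, y_test = generate_data(n_test, n_features_max)
print(clf_fixed.score(x_test, y_test))   # accuracy with hand-picked shrinkage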