机器学习之决策树

程序员文章站 2022-03-30 23:02:33

...

机器学习之决策树

"""决策树"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

# Load the iris data set and keep only the feature columns from index 2 onward.
iris = datasets.load_iris()
X, y = iris.data[:, 2:], iris.target

# One scatter call per class label (0, 1, 2); each call plots column 0
# against column 1 for the rows carrying that label.
plt.scatter(*X[y == 0].T)
plt.scatter(*X[y == 1].T)
plt.scatter(*X[y == 2].T)
plt.show()

# Depth-2 tree that chooses its splits with the entropy criterion.
dt_clf = DecisionTreeClassifier(max_depth=2, criterion='entropy').fit(X, y)

def plot_decision_boundary(model,axis):
    """Shade the regions predicted by ``model`` on the current matplotlib figure.

    Args:
        model: object exposing ``predict`` that accepts an array with two
            columns and returns one prediction per row.
        axis: sequence ``[x_min, x_max, y_min, y_max]`` bounding the area
            to evaluate.
    """
    from matplotlib.colors import ListedColormap

    # Sample the window on a regular grid with 100 points per axis unit.
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    # Flatten the grid into (n_points, 2) rows, predict, then fold the
    # predictions back into the grid shape for contour plotting.
    X_new = np.c_[x0.ravel(), x1.ravel()]
    zz = model.predict(X_new).reshape(x0.shape)

    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # The former ``linewidth=5`` argument was not used by contourf and only
    # produced a "kwargs were not used by contour" UserWarning, so it is gone.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
# Draw the predicted regions first, then overlay the samples of each class.
plot_decision_boundary(dt_clf, axis=[0.5, 7.5, -1.0, 3])
plt.scatter(*X[y == 0].T)
plt.scatter(*X[y == 1].T)
plt.scatter(*X[y == 2].T)
plt.show()

机器学习之决策树

"""模拟使用信息熵进行划分"""
def split(X,y,d,value):
    """Partition ``X`` and ``y`` by thresholding column ``d`` at ``value``.

    Rows whose entry in column ``d`` is ``<= value`` go to the left part,
    rows whose entry is ``> value`` go to the right part.

    Returns:
        Tuple ``(X_left, X_right, y_left, y_right)``.
    """
    column = X[:, d]
    goes_left = column <= value
    goes_right = column > value
    return X[goes_left], X[goes_right], y[goes_left], y[goes_right]

from math import log
from collections import Counter
def entropy(y):
    """Return the information entropy of the labels in ``y`` (natural log).

    Computes ``-sum(p * log(p))`` over the relative frequency ``p`` of each
    distinct label. An empty ``y`` yields ``0.0``.
    """
    total = len(y)
    result = 0.0
    for count in Counter(y).values():
        proportion = count / total
        result -= proportion * log(proportion)
    return result

def try_split(X,y):
    """Search every column and threshold for the lowest-entropy split.

    For each column the rows are sorted, and the midpoint between every pair
    of consecutive, distinct values is tried as a threshold. A candidate is
    scored by the plain (unweighted) sum of the entropies of the two label
    subsets it produces.

    Returns:
        Tuple ``(best_entropy, best_d, best_v)``: the lowest score, the
        column index and the threshold. When no candidate threshold exists
        the result is ``(inf, -1, -1)``.
    """
    best_entropy = float('inf')
    best_d, best_v = -1, -1
    n_features = X.shape[1]
    for d in range(n_features):
        order = np.argsort(X[:, d])
        ordered = X[order, d]
        # Walk over neighbouring pairs of the sorted column values.
        for lower, upper in zip(ordered[:-1], ordered[1:]):
            if lower == upper:
                # Equal neighbours give no usable threshold between them.
                continue
            v = (lower + upper) / 2
            _, _, y_l, y_r = split(X, y, d, v)
            e = entropy(y_l) + entropy(y_r)
            if e < best_entropy:
                best_entropy, best_d, best_v = e, d, v
    return best_entropy, best_d, best_v


# First split: search the whole data set.
best_entropy, best_d, best_v = try_split(X, y)
print('best_entropy=', best_entropy)
print('best_d=', best_d)
print('best_v=', best_v)

# Entropy of the left and right parts after the first split.
X1_l, X1_r, y1_l, y1_r = split(X, y, best_d, best_v)
print(entropy(y1_l), entropy(y1_r), sep='\n')

# Second split: search only the right part of the first split.
best_entropy2, best_d2, best_v2 = try_split(X1_r, y1_r)
print('best_entropy=', best_entropy2)
print('best_d=', best_d2)
print('best_v=', best_v2)

# Entropy of the two parts produced by the second split.
X2_l, X2_r, y2_l, y2_r = split(X1_r, y1_r, best_d2, best_v2)
print(entropy(y2_l), entropy(y2_r), sep='\n')

结果：

E:\pythonspace\KNN_function\venv\Scripts\python.exe E:/pythonspace/KNN_function/try.py
E:\pythonspace\KNN_function\venv\lib\site-packages\matplotlib\contour.py:960: UserWarning: The following kwargs were not used by contour: 'linewidth'
  s)
best_entropy= 0.6931471805599453
best_d= 0
best_v= 2.45
0.0
0.6931471805599453
best_entropy= 0.4132278899361904
best_d= 1
best_v= 1.75
0.30849545083110386
0.10473243910508653

Process finished with exit code 0

机器学习之决策树

参照上面的输出：第一次划分的最优维度是第 0 维（阈值 2.45），左边子集的信息熵为 0（已完全纯净），右边子集的信息熵约为 0.69，说明右边还可以继续划分。

第二次划分（针对第一次划分的右边子集）：最优维度是第 1 维（阈值 1.75），左边子集的信息熵约为 0.31，右边子集的信息熵约为 0.10。

使用基尼系数

机器学习之决策树

"""基尼系数"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

# Load the iris data set and keep only the feature columns from index 2 onward.
iris = datasets.load_iris()
X, y = iris.data[:, 2:], iris.target

# Depth-2 tree that chooses its splits with the Gini criterion.
dt_clf = DecisionTreeClassifier(max_depth=2, criterion='gini').fit(X, y)

def plot_decision_boundary(model,axis):
    """Shade the regions predicted by ``model`` on the current matplotlib figure.

    Args:
        model: object exposing ``predict`` that accepts an array with two
            columns and returns one prediction per row.
        axis: sequence ``[x_min, x_max, y_min, y_max]`` bounding the area
            to evaluate.
    """
    from matplotlib.colors import ListedColormap

    # Sample the window on a regular grid with 100 points per axis unit.
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    # Flatten the grid into (n_points, 2) rows, predict, then fold the
    # predictions back into the grid shape for contour plotting.
    X_new = np.c_[x0.ravel(), x1.ravel()]
    zz = model.predict(X_new).reshape(x0.shape)

    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # The former ``linewidth=5`` argument was not used by contourf and only
    # produced a "kwargs were not used by contour" UserWarning, so it is gone.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
# Draw the predicted regions first, then overlay the samples of each class.
plot_decision_boundary(dt_clf, axis=[0.5, 7.5, -1.0, 3])
plt.scatter(*X[y == 0].T)
plt.scatter(*X[y == 1].T)
plt.scatter(*X[y == 2].T)
plt.show()

"""模拟使用基尼系数进行划分"""
def split(X,y,d,value):
    """Partition ``X`` and ``y`` by thresholding column ``d`` at ``value``.

    Rows whose entry in column ``d`` is ``<= value`` go to the left part,
    rows whose entry is ``> value`` go to the right part.

    Returns:
        Tuple ``(X_left, X_right, y_left, y_right)``.
    """
    column = X[:, d]
    goes_left = column <= value
    goes_right = column > value
    return X[goes_left], X[goes_right], y[goes_left], y[goes_right]

from math import log
from collections import Counter
def jini(y):
    """Return the Gini impurity of the labels in ``y``.

    Computes ``1 - sum(p ** 2)`` over the relative frequency ``p`` of each
    distinct label. An empty ``y`` yields ``1.0``.
    """
    total = len(y)
    impurity = 1.0
    for count in Counter(y).values():
        share = count / total
        impurity -= share ** 2
    return impurity

def try_split(X,y):
    """Search every column and threshold for the lowest-Gini split.

    For each column the rows are sorted, and the midpoint between every pair
    of consecutive, distinct values is tried as a threshold. A candidate is
    scored by the plain (unweighted) sum of ``jini`` over the two label
    subsets it produces.

    Returns:
        Tuple ``(best_g, best_d, best_v)``: the lowest score, the column
        index and the threshold. When no candidate threshold exists the
        result is ``(inf, -1, -1)``.
    """
    best_g = float('inf')
    best_d, best_v = -1, -1
    n_features = X.shape[1]
    for d in range(n_features):
        order = np.argsort(X[:, d])
        ordered = X[order, d]
        # Walk over neighbouring pairs of the sorted column values.
        for lower, upper in zip(ordered[:-1], ordered[1:]):
            if lower == upper:
                # Equal neighbours give no usable threshold between them.
                continue
            v = (lower + upper) / 2
            _, _, y_l, y_r = split(X, y, d, v)
            g = jini(y_l) + jini(y_r)
            if g < best_g:
                best_g, best_d, best_v = g, d, v
    return best_g, best_d, best_v


# First split: search the whole data set.
best_g, best_d, best_v = try_split(X, y)
print('best_g=', best_g)
print('best_d=', best_d)
print('best_v=', best_v)

# Gini value of the left and right parts after the first split.
X1_l, X1_r, y1_l, y1_r = split(X, y, best_d, best_v)
print(jini(y1_l), jini(y1_r), sep='\n')

# Second split: search only the right part of the first split.
best_g2, best_d2, best_v2 = try_split(X1_r, y1_r)
print('best_g=', best_g2)
print('best_d=', best_d2)
print('best_v=', best_v2)

# Gini value of the two parts produced by the second split.
X2_l, X2_r, y2_l, y2_r = split(X1_r, y1_r, best_d2, best_v2)
print(jini(y2_l), jini(y2_r), sep='\n')

结果：

E:\pythonspace\KNN_function\venv\Scripts\python.exe E:/pythonspace/KNN_function/try.py
E:\pythonspace\KNN_function\venv\lib\site-packages\matplotlib\contour.py:960: UserWarning: The following kwargs were not used by contour: 'linewidth'
  s)
best_g= 0.5
best_d= 0
best_v= 2.45
0.0
0.5
best_g= 0.2105714900645938
best_d= 1
best_v= 1.75
0.1680384087791495
0.04253308128544431

Process finished with exit code 0

基尼系数和信息熵都用来衡量数据集的不纯度，原理大致相同；从上面两次输出可以看到，本例中二者选出的划分维度和阈值完全一致（第 0 维 2.45、第 1 维 1.75）。

机器学习之决策树

决策树解决回归问题


"""决策树解决回归问题"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split

# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
# installed version still provides it before running this script.
boston = datasets.load_boston()
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

from sklearn.tree import DecisionTreeRegressor
# Fit a regressor with default hyper-parameters and print its score on the
# held-out test split.
dt_reg = DecisionTreeRegressor().fit(X_train, y_train)
print(dt_reg.score(X_test, y_test))

结果：
测试集上的 R² 只有约 0.68。这里的决策树使用默认参数、没有限制深度，很可能出现了过拟合（可以再打印训练集上的得分进行对比确认）。

E:\pythonspace\KNN_function\venv\Scripts\python.exe E:/pythonspace/KNN_function/try.py
0.68499290930685

Process finished with exit code 0

上一篇： Linux 特殊权限

下一篇：湖南理工类大学排名及分数线汇总2022高考参考

机器学习之决策树

使用基尼系数

决策树解决回归问题

实例学习PHP之留言程序

php字符串函数学习之substr()，字符串substr_PHP教程

[walkerlee原作]对PHP之函数sprintf()的学习研究笔记_PHP

mysql学习之基础篇02

Symfony2框架学习笔记之表单用法详解，symfony2学习笔记_PHP教程

mysql学习之基础篇01

Python机器学习之基础概述

PHP学习之输出字符串(echo,print,printf,print_r和var_dump)_php基础

Yii学习总结之数据访问对象 (DAO)_php实例

scikit实现机器学习常用模型