机器学习之决策树
程序员文章站
2022-03-30 23:02:33
...
"""决策树"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
# Load iris and keep only the feature columns from index 2 onward so the
# samples can be drawn on a 2-D plane.
iris = datasets.load_iris()
X = iris.data[:, 2:]
y = iris.target

# One scatter call per class label (0, 1, 2), in that order.
for _label in (0, 1, 2):
    _mask = y == _label
    plt.scatter(X[_mask, 0], X[_mask, 1])
plt.show()

# Depth-2 decision tree that splits on information entropy.
dt_clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')
dt_clf.fit(X, y)
def plot_decision_boundary(model, axis):
    """Shade the regions that *model* predicts over a rectangular 2-D window.

    Args:
        model: fitted estimator whose ``predict`` accepts an array with two
            columns and returns one label per row.
        axis: ``[x_min, x_max, y_min, y_max]`` bounds of the window.

    The window is sampled on a grid with ``int(extent * 100)`` points along
    each axis; every grid point is classified and the result is drawn as a
    filled contour plot on the current matplotlib figure.
    """
    from matplotlib.colors import ListedColormap

    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    # Flatten the grid into an (n_points, 2) sample matrix for predict().
    X_new = np.c_[x0.ravel(), x1.ravel()]
    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # `linewidth` is not a contourf keyword: it was ignored and only
    # produced a "kwargs were not used by contour" UserWarning, so it is
    # no longer passed.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
# Draw the fitted tree's decision regions, then overlay the samples
# class by class (labels 0, 1, 2 in that order).
plot_decision_boundary(dt_clf, axis=[0.5, 7.5, -1.0, 3])
for _label in (0, 1, 2):
    _mask = y == _label
    plt.scatter(X[_mask, 0], X[_mask, 1])
plt.show()
"""模拟使用信息熵进行划分"""
def split(X, y, d, value):
    """Partition samples by thresholding feature column *d* at *value*.

    Rows with ``X[:, d] <= value`` go to the left part, rows with
    ``X[:, d] > value`` go to the right part.

    Returns:
        ``(X_left, X_right, y_left, y_right)``.
    """
    column = X[:, d]
    goes_left = column <= value
    goes_right = column > value
    return X[goes_left], X[goes_right], y[goes_left], y[goes_right]
from math import log
from collections import Counter
def entropy(y):
    """Return the information entropy of the labels in *y* (natural log).

    Computes ``-sum(p * log(p))`` over the relative frequency ``p`` of each
    distinct label. An empty *y* yields ``0.0``.
    """
    total = len(y)
    res = 0.0
    for count in Counter(y).values():
        p = count / total
        res -= p * log(p)
    return res
def try_split(X, y):
    """Search every feature for the threshold with the lowest entropy.

    For each column, the midpoint between every pair of adjacent, distinct
    sorted values is tried as a threshold. A candidate is scored as
    ``entropy(left) + entropy(right)`` (an unweighted sum), and the first
    candidate with the strictly lowest score wins.

    Returns:
        ``(best_entropy, best_d, best_v)``; ``(inf, -1, -1)`` when no
        candidate threshold exists.
    """
    best_entropy = float('inf')
    best_d, best_v = -1, -1
    for d in range(X.shape[1]):
        order = np.argsort(X[:, d])
        for prev, curr in zip(order[:-1], order[1:]):
            lower, upper = X[prev, d], X[curr, d]
            if lower == upper:
                # Identical neighbours give no threshold between them.
                continue
            v = (lower + upper) / 2
            _, _, y_l, y_r = split(X, y, d, v)
            e = entropy(y_l) + entropy(y_r)
            if e < best_entropy:
                best_entropy, best_d, best_v = e, d, v
    return best_entropy, best_d, best_v
# First split: search the whole data set.
best_entropy, best_d, best_v = try_split(X, y)
for _tag, _val in (('best_entropy=', best_entropy),
                   ('best_d=', best_d),
                   ('best_v=', best_v)):
    print(_tag, _val)
X1_l, X1_r, y1_l, y1_r = split(X, y, best_d, best_v)
for _side in (y1_l, y1_r):
    print(entropy(_side))

# Second split: search again inside the right-hand part of the first split.
best_entropy2, best_d2, best_v2 = try_split(X1_r, y1_r)
for _tag, _val in (('best_entropy=', best_entropy2),
                   ('best_d=', best_d2),
                   ('best_v=', best_v2)):
    print(_tag, _val)
X2_l, X2_r, y2_l, y2_r = split(X1_r, y1_r, best_d2, best_v2)
for _side in (y2_l, y2_r):
    print(entropy(_side))
结果:
E:\pythonspace\KNN_function\venv\Scripts\python.exe E:/pythonspace/KNN_function/try.py
E:\pythonspace\KNN_function\venv\lib\site-packages\matplotlib\contour.py:960: UserWarning: The following kwargs were not used by contour: 'linewidth'
s)
best_entropy= 0.6931471805599453
best_d= 0
best_v= 2.45
0.0
0.6931471805599453
best_entropy= 0.4132278899361904
best_d= 1
best_v= 1.75
0.30849545083110386
0.10473243910508653
Process finished with exit code 0
参照上图:第一次划分的最佳维度是第0维(阈值为2.45),左侧信息熵为0(已经完全纯净),右侧信息熵约为0.69,说明右侧还可以继续划分。
第二次划分的最佳维度是第1维(阈值为1.75),左侧信息熵约为0.31,右侧信息熵约为0.10。
使用基尼系数
"""基尼系数"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
# Load iris and keep only the feature columns from index 2 onward.
iris = datasets.load_iris()
X, y = iris.data[:, 2:], iris.target

# Depth-2 decision tree that splits on Gini impurity.
dt_clf = DecisionTreeClassifier(criterion='gini', max_depth=2)
dt_clf.fit(X, y)
def plot_decision_boundary(model, axis):
    """Shade the regions that *model* predicts over a rectangular 2-D window.

    Args:
        model: fitted estimator whose ``predict`` accepts an array with two
            columns and returns one label per row.
        axis: ``[x_min, x_max, y_min, y_max]`` bounds of the window.

    The window is sampled on a grid with ``int(extent * 100)`` points along
    each axis; every grid point is classified and the result is drawn as a
    filled contour plot on the current matplotlib figure.
    """
    from matplotlib.colors import ListedColormap

    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    # Flatten the grid into an (n_points, 2) sample matrix for predict().
    X_new = np.c_[x0.ravel(), x1.ravel()]
    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # `linewidth` is not a contourf keyword: it was ignored and only
    # produced a "kwargs were not used by contour" UserWarning, so it is
    # no longer passed.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
# Draw the Gini tree's decision regions, then overlay the samples
# class by class (labels 0, 1, 2 in that order).
plot_decision_boundary(dt_clf, axis=[0.5, 7.5, -1.0, 3])
for _label in (0, 1, 2):
    _mask = y == _label
    plt.scatter(X[_mask, 0], X[_mask, 1])
plt.show()
"""模拟使用基尼系数进行划分"""
def split(X, y, d, value):
    """Partition samples by thresholding feature column *d* at *value*.

    Rows with ``X[:, d] <= value`` form the left part and rows with
    ``X[:, d] > value`` form the right part.

    Returns:
        ``(X_left, X_right, y_left, y_right)``.
    """
    feature = X[:, d]
    at_or_below = feature <= value
    above = feature > value
    X_left, y_left = X[at_or_below], y[at_or_below]
    X_right, y_right = X[above], y[above]
    return X_left, X_right, y_left, y_right
from math import log
from collections import Counter
def jini(y):
    """Return the Gini impurity of the labels in *y*.

    Computes ``1 - sum(p ** 2)`` over the relative frequency ``p`` of each
    distinct label. An empty *y* yields ``1.0``.
    """
    total = len(y)
    impurity = 1.0
    for count in Counter(y).values():
        share = count / total
        impurity = impurity - share ** 2
    return impurity
def try_split(X, y):
    """Search every feature for the threshold with the lowest Gini score.

    For each column, the midpoint between every pair of adjacent, distinct
    sorted values is tried as a threshold. A candidate is scored as
    ``jini(left) + jini(right)`` (an unweighted sum), and the first
    candidate with the strictly lowest score wins.

    Returns:
        ``(best_g, best_d, best_v)``; ``(inf, -1, -1)`` when no candidate
        threshold exists.
    """
    best_g = float('inf')
    best_d, best_v = -1, -1
    for d in range(X.shape[1]):
        order = np.argsort(X[:, d])
        for prev, curr in zip(order[:-1], order[1:]):
            lower, upper = X[prev, d], X[curr, d]
            if lower == upper:
                # Identical neighbours give no threshold between them.
                continue
            v = (lower + upper) / 2
            _, _, y_l, y_r = split(X, y, d, v)
            g = jini(y_l) + jini(y_r)
            if g < best_g:
                best_g, best_d, best_v = g, d, v
    return best_g, best_d, best_v
# First split: search the whole data set.
best_g, best_d, best_v = try_split(X, y)
for _tag, _val in (('best_g=', best_g),
                   ('best_d=', best_d),
                   ('best_v=', best_v)):
    print(_tag, _val)
X1_l, X1_r, y1_l, y1_r = split(X, y, best_d, best_v)
for _side in (y1_l, y1_r):
    print(jini(_side))

# Second split: search again inside the right-hand part of the first split.
best_g2, best_d2, best_v2 = try_split(X1_r, y1_r)
for _tag, _val in (('best_g=', best_g2),
                   ('best_d=', best_d2),
                   ('best_v=', best_v2)):
    print(_tag, _val)
X2_l, X2_r, y2_l, y2_r = split(X1_r, y1_r, best_d2, best_v2)
for _side in (y2_l, y2_r):
    print(jini(_side))
结果:
E:\pythonspace\KNN_function\venv\Scripts\python.exe E:/pythonspace/KNN_function/try.py
E:\pythonspace\KNN_function\venv\lib\site-packages\matplotlib\contour.py:960: UserWarning: The following kwargs were not used by contour: 'linewidth'
s)
best_g= 0.5
best_d= 0
best_v= 2.45
0.0
0.5
best_g= 0.2105714900645938
best_d= 1
best_v= 1.75
0.1680384087791495
0.04253308128544431
Process finished with exit code 0
基尼系数和信息熵原理大致相同。
决策树解决回归问题
"""决策树解决回归问题"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load the Boston housing data. `datasets.load_boston` was removed in
# scikit-learn 1.2 (accessing it raises ImportError there), so fall back to
# rebuilding the same arrays from the original source, following the recipe
# given in scikit-learn's removal notice.
try:
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target
except (ImportError, AttributeError):
    import pandas as pd

    _raw = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    ).values
    # Each record spans two physical lines in the source file: the even
    # rows plus the first two values of the odd rows are the features,
    # and the third value of the odd rows is the target.
    X = np.hstack([_raw[::2, :], _raw[1::2, :2]])
    y = _raw[1::2, 2]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

from sklearn.tree import DecisionTreeRegressor

# Unconstrained regression tree; score() reports R^2 on the held-out split.
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)
print(dt_reg.score(X_test, y_test))
结果:
出现过拟合:测试集上的 R² 只有约 0.68,而未限制深度的决策树在训练集上的 R² 通常接近 1.0(可以用 dt_reg.score(X_train,y_train) 验证),两者差距很大。
E:\pythonspace\KNN_function\venv\Scripts\python.exe E:/pythonspace/KNN_function/try.py
0.68499290930685
Process finished with exit code 0
上一篇: Linux 特殊权限