欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

决策树 python手写 不用封装好的库

程序员文章站 2024-01-17 18:20:34
...

决策树 id3 学校作业,数据集不多,没划分测试训练,结果看最后

数据集及描述

import numpy as np
import pandas as pd
import numpy.random
import time
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.io import arff
filepath='caesarian.csv.arff'   
data = arff.loadarff(filepath)
df = pd.DataFrame(data[0],dtype='int')
df
C:\Users\20535\Anaconda3\envs\tensorflow\lib\site-packages\numpy\core\numeric.py:2378: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  return bool(asarray(a1 == a2).all())
Age Delivery number Delivery time Blood of Pressure Heart Problem Caesarian
0 22 1 0 2 0 0
1 26 2 0 1 0 1
2 26 2 1 1 0 0
3 28 1 0 2 0 0
4 22 2 0 1 0 1
... ... ... ... ... ... ...
75 27 2 1 1 0 0
76 33 4 0 1 0 1
77 29 2 1 2 0 1
78 25 1 2 0 0 1
79 24 2 2 1 0 0

80 rows × 6 columns

grid = sns.FacetGrid(df, size=2.2, aspect=1.6)
grid.map(sns.lineplot, 'Age', 'Caesarian', palette='deep')
grid.add_legend()
C:\Users\20535\Anaconda3\envs\tensorflow\lib\site-packages\seaborn\axisgrid.py:316: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)





<seaborn.axisgrid.FacetGrid at 0x21bf4553c08>

决策树 python手写 不用封装好的库

sns.countplot(x = 'Age', hue = "Caesarian", data = df)
plt.show()

决策树 python手写 不用封装好的库

df['Age'].max()-df['Age'].min()
23
df['Age'].max()
40
df['Age'].min()
17
df['Age'].loc[df['Age']>=32]=3
df['Age'].loc[df['Age']>=23]=2
df['Age'].loc[df['Age']>16]=1
grid = sns.FacetGrid(df, size=2.2, aspect=1.6)
grid.map(sns.lineplot, 'Age', 'Caesarian', palette='deep')
grid.add_legend()
C:\Users\20535\Anaconda3\envs\tensorflow\lib\site-packages\seaborn\axisgrid.py:316: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)





<seaborn.axisgrid.FacetGrid at 0x21bf482ce88>

决策树 python手写 不用封装好的库

年龄改成分层,数据量太少,年龄种类太多,分层能保证每个末端能多一点

Mdata=df.values
x=Mdata[:,0:5]
Y=Mdata[:,5:6]
len_of_feature_count = []
for i in df.columns.tolist():
    print(i, ':', len(df[i].astype(str).value_counts()))
    len_of_feature_count.append(len(df[i].astype(str).value_counts()))
print("over")
Age : 3
Delivery number : 4
Delivery time : 3
Blood of Pressure : 3
Heart Problem : 2
Caesarian : 2
over
df
Age Delivery number Delivery time Blood of Pressure Heart Problem Caesarian
0 1 1 0 2 0 0
1 2 2 0 1 0 1
2 2 2 1 1 0 0
3 2 1 0 2 0 0
4 1 2 0 1 0 1
... ... ... ... ... ... ...
75 2 2 1 1 0 0
76 3 4 0 1 0 1
77 2 2 1 2 0 1
78 2 1 2 0 0 1
79 2 2 2 1 0 0

80 rows × 6 columns

def hd(y):
    ty=(y==1).sum()
    fy=(y==0).sum()
    ay=len(y)
    a1=ty/ay
    a2=fy/ay
    if a1 != 0 and a2 !=0 and ay != 0:
        res=-a1*math.log(a1,2)-a2*math.log(a2,2)
    elif a2 !=0 and ay != 0:
        res=-a2*math.log(a2,2)
    elif a1 !=0 and ay != 0:
        res=-a1*math.log(a1,2)
    else:
        res=0
    return res
Uncertainty=hd(Y)
Uncertainty
0.9837082626231857
def gda(i,mdf):
    nl=len_of_feature_count[i]
    name=mdf.columns.tolist()[i]
    UTS=[]
    for t in list(set(mdf[name].values)):
        temp=mdf[mdf[name]==t]
        temp2=temp.values
        temp3=temp[temp['Caesarian']==1]
        temp_num=len(temp3)/len(Y)
        uncertainty=hd(temp2[:,-1])#小写  
        UTS.append(uncertainty*temp_num)
    
    return Uncertainty-sum(UTS)
        
RES=[]
for nn in range(5):
    res=gda(nn,df)
    RES.append(res)
    print(df.columns.tolist()[nn]+':'+str(res))
Age:0.4696775060793045
Delivery number:0.44683156475569896
Delivery time:0.4346659154931095
Blood of Pressure:0.46832970111430006
Heart Problem:0.49499326582091163
df.columns.tolist()[RES.index(max(RES))]
'Heart Problem'

从这里开始,上面都是些预处理和函数测试

in_put=df
Mytree={'name':'MDS','name':{}}

Count=0
def tree(temp_data,last_tree,last_name):
    global Count
    
    #if len(temp_data.columns.tolist())==1:  原版
    if len(temp_data.columns.tolist())==1 or hd(temp_data.values[:,-1])<0.3:
        Count+=1
        if temp_data['Caesarian'].mean()>=0.5:
            last_tree[last_name] ='true'
        else:
            last_tree[last_name] ='false'
        return
    RES=[]
    for nn in range(len(temp_data.columns.tolist())-1):
        res=gda(nn,df)
        #print(temp_data.columns.tolist()[nn]+':'+str(res))
        RES.append(res)
    Best_name=temp_data.columns.tolist()[RES.index(max(RES))]
    temp=list(set(temp_data[Best_name].values))
    temp.append('name')
    temp.append('last')
    mytree=dict.fromkeys(temp,0)
    mytree['name'] =Best_name
    mytree['last'] =last_name
    last_tree[last_name] =mytree
    for nnn in list(set(df[Best_name].values)):
        kid_data1=temp_data.loc[temp_data[Best_name]==nnn]
        kid_data=kid_data1.drop([Best_name],axis=1)
        tree(kid_data,mytree,nnn)
tree(df,Mytree,'name')
C:\Users\20535\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py:5: RuntimeWarning: invalid value encountered in long_scalars
  """
C:\Users\20535\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in long_scalars
Count#叶子节点数量
72
Mytree
{'name': {0: {1: {1: {0: {0: 'true',
      1: 'true',
      2: 'false',
      'name': 'Blood of Pressure',
      'last': 0},
     2: {2: 'true',
      'name': 'Blood of Pressure',
      'last': 2,
      0: 'false',
      1: 'false'},
     'name': 'Delivery time',
     'last': 1,
     1: 'false'},
    2: 'true',
    'name': 'Delivery number',
    'last': 1,
    3: 'false',
    4: 'false'},
   2: {1: {0: {0: 'true',
      1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 0},
     1: {0: 'false',
      1: 'false',
      2: 'false',
      'name': 'Blood of Pressure',
      'last': 1},
     2: {0: 'true',
      'name': 'Blood of Pressure',
      'last': 2,
      1: 'false',
      2: 'false'},
     'name': 'Delivery time',
     'last': 1},
    2: {0: {1: 'false',
      'name': 'Blood of Pressure',
      'last': 0,
      0: 'false',
      2: 'false'},
     1: {1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 1,
      0: 'false'},
     2: {0: 'false',
      1: 'false',
      'name': 'Blood of Pressure',
      'last': 2,
      2: 'false'},
     'name': 'Delivery time',
     'last': 2},
    3: {0: 'true', 2: 'false', 'name': 'Delivery time', 'last': 3, 1: 'false'},
    'name': 'Delivery number',
    'last': 2,
    4: 'false'},
   3: {1: {0: 'false',
     1: {0: 'true',
      1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 1},
     'name': 'Delivery time',
     'last': 1,
     2: 'false'},
    2: 'true',
    3: 'true',
    4: 'true',
    'name': 'Delivery number',
    'last': 3},
   'name': 'Age',
   'last': 0},
  1: {1: {1: {0: {1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 0,
      0: 'false'},
     1: 'true',
     'name': 'Delivery time',
     'last': 1,
     2: 'false'},
    2: 'true',
    'name': 'Delivery number',
    'last': 1,
    3: 'false',
    4: 'false'},
   2: {1: {0: 'true',
     2: {0: 'true',
      2: 'false',
      'name': 'Blood of Pressure',
      'last': 2,
      1: 'false'},
     'name': 'Delivery time',
     'last': 1,
     1: 'false'},
    2: {0: {0: 'true',
      1: 'true',
      'name': 'Blood of Pressure',
      'last': 0,
      2: 'false'},
     1: {1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 1,
      0: 'false'},
     'name': 'Delivery time',
     'last': 2,
     2: 'false'},
    3: 'true',
    'name': 'Delivery number',
    'last': 2,
    4: 'false'},
   3: {1: 'true',
    2: 'true',
    3: {0: {1: 'true',
      'name': 'Blood of Pressure',
      'last': 0,
      0: 'false',
      2: 'false'},
     1: 'true',
     2: {1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 2,
      0: 'false'},
     'name': 'Delivery time',
     'last': 3},
    4: 'true',
    'name': 'Delivery number',
    'last': 3},
   'name': 'Age',
   'last': 1},
  'name': 'Heart Problem',
  'last': 'name'}}
user1=Mytree['name']
cx=user1['name']
Iage=2
Idn=2
Idt=0
Ibp=0
Ihp=1
cx1=user1['name']
cx1
'Heart Problem'
user2=user1[Ihp]
cx2=user2['name']
cx2
'Age'
user3=user2[Iage]
cx3=user3['name']
cx3
'Delivery number'
user4=user3[Idn]
cx4=user4['name']
cx4
'Delivery time'
user5=user4[Idt]
cx5=user5['name']
cx5
'Blood of Pressure'

user6=user5[Ibp]
user6
'true'
label={'Age':0,'Delivery number':1,'Delivery time':2,'Blood of Pressure':3,'Heart Problem':4}
Input=[2,2,0,0,1]
user=Mytree['name']

def Result(input,user):
    while True:
        cx=user['name']
        user=user[input[label[cx]]]
        if user==('true'):
            return 1
        elif user==('false'):
            return 0
        else:
            cx=user['name']
Result(Input,Mytree['name'])
1
test=df.values
xtest=test[:,0:5]
ytest=test[:,5:6]

score=0
for i in range(len(test)):
    xin=xtest[i]
    yin=ytest[i]
    a=(Result(xin,Mytree['name']))
    b=(yin[0])
    if a==b:
        score+=1
print(score/len(test))
0.8625