基于随机森林的化合物活性二分类模型
程序员文章站
2022-07-14 15:19:29
...
基于随机森林算法的化合物二分类机器学习模型
代码示例
#导入依赖包
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem import PandasTools
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Feature calculation for a single compound.
#
# The descriptor calculator is built once at import time: it covers every
# descriptor registered in Descriptors._descList and holds no per-molecule
# state, so rebuilding it on every get_fps() call is pure overhead when the
# function is applied to each row of a data frame.
_DESCRIPTOR_NAMES = [entry[0] for entry in Descriptors._descList]
_DESCRIPTOR_CALCULATOR = MoleculeDescriptors.MolecularDescriptorCalculator(
    _DESCRIPTOR_NAMES)


def get_fps(mol):
    """Return the feature vector for one RDKit molecule.

    The vector is the first element of ``Fingerprinter.FingerprintMol(mol)``
    (per the RDKit documentation, the EState atom-type counts) followed by
    the value of every descriptor listed in ``Descriptors._descList``
    (clogP, PSA, etc.), in that list's order.

    Args:
        mol: an RDKit molecule. ``None`` (e.g. from an unparsable SMILES) is
            not handled here; callers are expected to filter such rows out.

    Returns:
        A 1-D ``numpy`` array: EState part first, descriptor values after.
    """
    descriptor_values = np.asarray(_DESCRIPTOR_CALCULATOR.CalcDescriptors(mol))
    estate_part = Fingerprinter.FingerprintMol(mol)[0]
    return np.append(estate_part, descriptor_values)
# Load the data set: only columns 0, 1 and 4 of the CSV are read. The code
# below expects these to include a 'mol' column (SMILES) and a 'pIC50' column.
df = pd.read_csv('mol_IC50.csv', usecols=[0, 1, 4])

# Shuffle the rows. A fixed seed keeps the row order reproducible between runs.
df = shuffle(df, random_state=42)

# Inspect the data. A bare `df.head()` only displays something inside a
# notebook cell; print() makes the preview visible when run as a script.
print(df.head())

# Parse the SMILES strings in column 'mol' into RDKit molecules stored in a
# new 'Molecule' column.
PandasTools.AddMoleculeColumnToFrame(df, 'mol', 'Molecule')

# Drop rows that cannot be used for training:
#   * molecules that failed to parse are None and would crash get_fps();
#   * a missing pIC50 would otherwise compare as "not > 6" and be silently
#     labelled inactive.
df = df.dropna(subset=['Molecule', 'pIC50'])

# Inspect the data again, now with the molecule column.
print(df.head())

# Compute the descriptor / fingerprint feature vector for every molecule.
df['Descriptors'] = df['Molecule'].apply(get_fps)

# Add the class label: pIC50 > 6 is an active compound (Active = 1), else 0.
df['Active'] = np.where(df['pIC50'] > 6, 1, 0)

# Convert features and labels into arrays for scikit-learn:
# X has one row per compound, y holds the 0/1 activity labels.
X = np.array(list(df['Descriptors']))
y = df['Active'].values
# Split into training and test sets (25% held out). The fixed seed makes the
# split, and therefore the reported AUC, reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=42)

# Create the random-forest model and fit it to the training data.
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3
# (it now raises); for classifiers it meant 'sqrt', so 'sqrt' keeps the
# original behaviour on every version.
rf = RandomForestClassifier(max_features='sqrt', random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set: hard 0/1 labels (kept for any later use) and the
# predicted probability of the active class. Column 1 of predict_proba
# corresponds to rf.classes_[1], i.e. label 1 when both classes occur in the
# training data.
y_pred = rf.predict(X_test)
y_score = rf.predict_proba(X_test)[:, 1]

# ROC statistics. The curve must be built from continuous scores: feeding the
# hard labels from predict() yields a single operating point and an "AUC"
# that only reflects that one threshold.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Draw the ROC curve, save it to ROC.jpg and display it.
roc_label = 'AUC = %0.2f' % roc_auc
chance_diagonal = [0, 1]
axis_range = [-0.1, 1.2]

# Model curve (solid blue) and the chance diagonal (dashed red). Only the
# model curve carries a label, so it is the only legend entry.
plt.plot(fpr, tpr, 'b', label=roc_label)
plt.plot(chance_diagonal, chance_diagonal, 'r--')
plt.legend(loc='lower right')

# Title, axis labels and a little padding around the unit square.
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(axis_range)
plt.ylim(axis_range)

# Save before show(): show() may leave an empty figure behind once closed.
plt.savefig("ROC.jpg", dpi=300)
plt.show()