欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页

青蒿素抗疟疾效果预测比赛

程序员文章站 2022-03-22 18:00:40
...

比赛网址及说明

https://www.synapse.org/#!Synapse:syn16924919/wiki/583955

青蒿素抗疟疾效果预测比赛

读取文件并简单分析

import pandas as pd
# Load the challenge training and test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./SubCh1_TrainingData.csv', './SubCh1_TestData.csv')
)
# Preview the first rows of the training table.
train_data.head()
Sample_Name Isolate Timepoint Treatment BioRep MAL1.465720.465873.bc..rv.. MAL1.48287.48430....kr...can MAL1.562126.562246.bc..r... MAL1.59416.59687....kr...can MAL1.616633.616682.......y. ... PF3D7_1479200 PF3D7_1479400 PF3D7_1479500 PF3D7_1479600 PF3D7_1479700 PF3D7_1479800 PF3D7_1479900 PF3D7_1480000 PF3D7_1480100 DHA_IC50
0 isolate_01.24HR.DHA.BRep1 isolate_01 24HR DHA BRep1 0.008286 -0.959492 0.206468 -0.443936 0.453030 ... -0.540993 -3.674097 -1.652979 -2.255490 -4.554757 -0.381422 -1.415857 -4.121011 -2.486528 2.177
1 isolate_01.24HR.DHA.BRep2 isolate_01 24HR DHA BRep2 -0.872028 -1.968558 -2.308563 -1.186611 0.508575 ... 0.450649 -4.464408 -0.977954 -2.012559 -4.538550 -2.333890 -2.342496 -4.774197 -1.794568 2.177
2 isolate_01.24HR.UT.BRep1 isolate_01 24HR UT BRep1 0.039480 -1.291371 -0.946661 -0.538092 -1.778501 ... 1.411557 -5.500522 -1.764229 -1.947208 -5.331037 -0.814417 -2.141285 -3.922684 -2.495169 2.177
3 isolate_01.24HR.UT.BRep2 isolate_01 24HR UT BRep2 0.125177 -1.462758 -1.888105 -0.828352 -1.410803 ... -0.659519 -5.011207 -0.559471 -2.266763 -5.289318 -2.949869 -2.206432 -4.324299 -1.735312 2.177
4 isolate_01.6HR.DHA.BRep1 isolate_01 6HR DHA BRep1 1.354956 -2.139128 -0.541696 -1.481881 -0.735798 ... -1.687048 -4.773306 -0.048847 -2.023727 -4.398358 -3.080500 -0.566545 -4.798190 -0.821690 2.177

5 rows × 5546 columns

test_data.head()
Sample_Names Isolate Timepoint Treatment BioRep MAL1.465720.465873.bc..rv.. MAL1.48287.48430....kr...can MAL1.562126.562246.bc..r... MAL1.59416.59687....kr...can MAL1.616633.616682.......y. ... PF3D7_1479200 PF3D7_1479400 PF3D7_1479500 PF3D7_1479600 PF3D7_1479700 PF3D7_1479800 PF3D7_1479900 PF3D7_1480000 PF3D7_1480100 DHA_IC50
0 isolate_31.24HR.DHA.BRep1 isolate_31 24HR DHA BRep1 0.416583 -2.304394 -1.279308 -1.128485 0.129698 ... 0.117763 -1.650283 -0.520373 -1.301428 -5.326124 -0.352736 -1.567931 -4.446439 -1.875440 NaN
1 isolate_31.24HR.DHA.BRep2 isolate_31 24HR DHA BRep2 0.247195 -2.176939 -1.470318 -0.676434 -0.206543 ... 0.472336 -1.151134 -0.342707 -1.001569 -5.337602 -0.672201 -1.494983 -4.299427 -1.553652 NaN
2 isolate_31.24HR.UT.BRep1 isolate_31 24HR UT BRep1 -0.197699 -0.572531 0.270896 0.557528 -0.698585 ... -0.309768 -6.330446 -0.850889 -1.603832 -3.256602 -2.517157 -0.650735 -4.841146 0.153951 NaN
3 isolate_31.24HR.UT.BRep2 isolate_31 24HR UT BRep2 0.966916 -1.723830 -1.102392 -0.797996 0.032847 ... -0.254186 -1.516661 -1.916219 -1.858716 -6.483167 -0.827476 -1.454423 -5.264615 -2.367444 NaN
4 isolate_31.6HR.DHA.BRep1 isolate_31 6HR DHA BRep1 0.125273 -1.665749 -2.510245 -1.454094 -1.763451 ... 0.653097 -2.383878 -2.731928 -2.557962 -5.697488 -3.466542 -2.147289 -4.358510 -1.929927 NaN

5 rows × 5546 columns

import seaborn as sns
import matplotlib.pyplot as plt 
# Draw a kernel density estimate of the DHA_IC50 target in the training table
# to inspect how the response values are distributed.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill`;
# confirm the installed version before switching the keyword.
c1, c2 = sns.color_palette('Set1', 2)
f, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(8, 6))
sns.kdeplot(
    train_data.DHA_IC50,
    shade=True,
    color=c1,
    label='DHA_IC50',
    ax=ax,
)

青蒿素抗疟疾效果预测比赛

from collections import Counter
Counter(train_data.Isolate)
Counter({'isolate_01': 8,
         'isolate_02': 16,
         'isolate_03': 8,
         'isolate_04': 8,
         'isolate_05': 8,
         'isolate_06': 8,
         'isolate_07': 8,
         'isolate_08': 8,
         'isolate_09': 8,
         'isolate_10': 8,
         'isolate_11': 8,
         'isolate_12': 8,
         'isolate_13': 8,
         'isolate_14': 8,
         'isolate_15': 8,
         'isolate_16': 8,
         'isolate_17': 8,
         'isolate_18': 8,
         'isolate_19': 8,
         'isolate_20': 8,
         'isolate_21': 8,
         'isolate_22': 8,
         'isolate_23': 8,
         'isolate_24': 8,
         'isolate_25': 8,
         'isolate_26': 8,
         'isolate_27': 32,
         'isolate_28': 8,
         'isolate_29': 8,
         'isolate_30': 8})
Counter(test_data.Isolate)
Counter({'isolate_31': 8,
         'isolate_32': 8,
         'isolate_33': 8,
         'isolate_34': 8,
         'isolate_35': 8,
         'isolate_36': 8,
         'isolate_37': 8,
         'isolate_38': 8,
         'isolate_39': 8,
         'isolate_40': 8,
         'isolate_41': 8,
         'isolate_42': 8,
         'isolate_43': 8,
         'isolate_44': 8,
         'isolate_45': 8,
         'isolate_46': 8,
         'isolate_47': 8,
         'isolate_48': 8,
         'isolate_49': 8,
         'isolate_50': 8,
         'isolate_51': 8,
         'isolate_52': 8,
         'isolate_53': 8,
         'isolate_54': 8,
         'isolate_55': 8})

可以看到训练集和测试集并不共享 isolate,因此 Isolate 这一列不具有作为预测特征的使用价值。

Counter(train_data.Timepoint)
Counter({'24HR': 136, '6HR': 136})
Counter(test_data.Timepoint)
Counter({'24HR': 100, '6HR': 100})
Counter(train_data.Treatment)
Counter({'DHA': 136, 'UT': 136})
Counter(test_data.Treatment)
Counter({'DHA': 100, 'UT': 100})
Counter(train_data.BioRep)
Counter({'BRep1': 120,
         'BRep2': 120,
         'BRep3': 8,
         'BRep4': 8,
         'BRep5': 4,
         'BRep6': 4,
         'BRep7': 4,
         'BRep8': 4})
Counter(test_data.BioRep)
Counter({'BRep1': 100, 'BRep2': 100})

相关系数筛选法降维

# Spearman rank correlation between every numeric column and the target.
# Numeric columns are selected explicitly: recent pandas versions raise on the
# string columns (Sample_Name, Isolate, ...) instead of silently skipping them.
CORR = train_data.select_dtypes(include='number').corr('spearman')
CORRDHA = list(CORR.DHA_IC50)
# Plot the distribution of the per-column correlations with DHA_IC50.
f, (ax1) = plt.subplots(1, 1, sharex=True, figsize=(6, 3))
sns.kdeplot(CORRDHA, shade=True, color=c1, label='cor for DHA_IC50', ax=ax1)
# Positional indices of the columns whose |rho| with DHA_IC50 exceeds 0.12.
# DHA_IC50 itself always qualifies because it correlates 1.0 with itself.
select_cor = [i for i, rho in enumerate(CORRDHA) if abs(rho) > 0.12]

青蒿素抗疟疾效果预测比赛

len(select_cor)
35
# Translate the selected positions back into column names, then collect every
# other column of the correlation matrix as a candidate for removal.
colnames = CORR.columns.tolist()
colnamess = [colnames[pos] for pos in select_cor]
dropnames = list(set(colnames).difference(colnamess))
len(dropnames)
5506
# Remove the weakly correlated columns from the training table in place.
train_data.drop(columns=dropnames, inplace=True)
train_data
Sample_Name Isolate Timepoint Treatment BioRep MAL1.59416.59687....kr...can PF3D7_0108200 PF3D7_0115400 PF3D7_0201000 PF3D7_0201500 ... PF3D7_1247200 PF3D7_1321000 PF3D7_1332500 PF3D7_1337700 PF3D7_1401000 PF3D7_1431300 PF3D7_1445900 PF3D7_1452200 PF3D7_1456900 DHA_IC50
0 isolate_01.24HR.DHA.BRep1 isolate_01 24HR DHA BRep1 -0.443936 -0.100675 -2.709971 -3.213864 -1.091086 ... -0.985147 0.597583 -1.155107 0.566342 -3.282937 -0.828819 -0.988564 1.273673 -1.168364 2.177
1 isolate_01.24HR.DHA.BRep2 isolate_01 24HR DHA BRep2 -1.186611 0.735564 -2.472694 -3.924790 -0.645725 ... -0.855834 0.246352 -2.062505 0.728823 -2.569909 0.395816 -0.221650 0.342256 0.356901 2.177
2 isolate_01.24HR.UT.BRep1 isolate_01 24HR UT BRep1 -0.538092 1.220332 -1.666902 -5.039397 1.507344 ... -0.255342 -0.056962 -0.741269 -0.357539 -1.195457 0.443488 0.627498 0.386491 1.687583 2.177
3 isolate_01.24HR.UT.BRep2 isolate_01 24HR UT BRep2 -0.828352 0.575415 -2.744366 -5.905546 -0.568281 ... -0.558252 0.201290 -0.252612 1.202704 -1.468765 0.700349 0.024436 -0.014673 0.206773 2.177
4 isolate_01.6HR.DHA.BRep1 isolate_01 6HR DHA BRep1 -1.481881 0.283878 -2.322445 -5.374090 0.319846 ... -0.422696 -0.244172 -0.013380 0.468331 -1.030617 0.364250 -0.580314 0.564132 -0.431109 2.177
5 isolate_01.6HR.DHA.BRep2 isolate_01 6HR DHA BRep2 -0.818662 0.639680 -2.786851 -5.406612 -0.960168 ... -1.585799 0.386760 -0.882748 1.510844 -2.030041 0.529667 -0.535430 0.156222 -1.313564 2.177
6 isolate_01.6HR.UT.BRep1 isolate_01 6HR UT BRep1 -1.441343 0.543945 -0.516681 -3.725347 -3.973098 ... -0.908802 1.008485 -0.722783 -0.004425 -1.579508 0.668031 -0.381967 0.866779 -0.277187 2.177
7 isolate_01.6HR.UT.BRep2 isolate_01 6HR UT BRep2 -1.424590 0.984280 -3.440088 -5.533135 -0.368167 ... -0.302371 -0.258883 -0.655029 0.702457 -2.897724 1.122203 0.288328 0.671121 0.862057 2.177
8 isolate_02.24HR.DHA.BRep1 isolate_02 24HR DHA BRep1 -1.484605 0.833452 -3.422781 -5.319162 -0.373591 ... -1.036664 0.266881 -0.922936 1.320746 -2.301671 0.732093 -0.254396 -0.287382 -0.564506 1.697
9 isolate_02.24HR.DHA.BRep2 isolate_02 24HR DHA BRep2 -0.686910 1.566094 -3.461451 -6.284323 0.606515 ... -0.113845 -0.066738 0.238364 -0.359802 -1.050140 1.392860 0.348988 1.027637 1.305501 1.697
10 isolate_02.24HR.DHA.BRep3 isolate_02 24HR DHA BRep3 -1.337437 0.870020 -1.086237 -4.927345 -0.236146 ... -1.136695 0.139889 -0.690148 0.711058 -2.461177 0.563514 0.074102 0.008713 0.097603 1.697
11 isolate_02.24HR.DHA.BRep4 isolate_02 24HR DHA BRep4 -1.144765 0.603688 -2.186673 -5.445916 -1.060517 ... -0.355237 0.179440 -0.413934 0.418851 -1.121716 0.945285 0.304349 0.240995 0.413619 1.697
12 isolate_02.24HR.UT.BRep1 isolate_02 24HR UT BRep1 -0.969815 0.428852 -2.079885 -3.698236 -1.010320 ... -1.362287 0.869552 0.897190 1.477135 -1.459841 0.131137 -0.968894 0.530704 -1.180350 1.697
13 isolate_02.24HR.UT.BRep2 isolate_02 24HR UT BRep2 -0.831103 1.055918 -3.294669 -6.621351 -0.365654 ... -0.215727 -0.105166 0.295103 0.566553 -1.289015 1.008094 0.539046 -0.099084 0.611115 1.697
14 isolate_02.24HR.UT.BRep3 isolate_02 24HR UT BRep3 -1.067333 0.453303 -2.675612 -6.288987 -0.541833 ... -1.013739 0.328516 -0.465013 0.451698 -1.176785 0.396625 -0.200786 0.523071 -0.903835 1.697
15 isolate_02.24HR.UT.BRep4 isolate_02 24HR UT BRep4 -0.839616 0.446568 -2.450340 -5.608584 -0.604042 ... -0.957355 0.489947 -1.122378 0.607087 -0.741156 0.512648 -0.257656 0.610839 -0.684762 1.697
16 isolate_02.6HR.DHA.BRep1 isolate_02 6HR DHA BRep1 -0.792754 0.958172 -0.852579 -5.006698 -0.828517 ... -1.377291 0.159294 -0.532192 1.358049 -2.365829 0.281288 -0.214322 -0.123179 -0.640692 1.697
17 isolate_02.6HR.DHA.BRep2 isolate_02 6HR DHA BRep2 -1.046670 0.098186 -1.771210 -5.853528 -0.305286 ... -0.837113 0.295548 -0.113224 0.320856 -0.736279 -0.230788 -0.082347 0.430137 -0.242890 1.697
18 isolate_02.6HR.DHA.BRep3 isolate_02 6HR DHA BRep3 -1.588809 0.739118 -2.904774 -5.202351 1.385476 ... 0.199116 -0.477491 0.665402 0.022030 -0.120614 0.942747 0.473917 -0.051604 0.569513 1.697
19 isolate_02.6HR.DHA.BRep4 isolate_02 6HR DHA BRep4 -0.680281 1.448573 -2.357755 -4.116196 -1.124109 ... -0.920867 0.065383 -1.071213 1.024844 -1.541809 0.218041 -0.326610 0.665595 0.324805 1.697
20 isolate_02.6HR.UT.BRep1 isolate_02 6HR UT BRep1 -1.085373 0.531026 -2.309487 -4.641513 0.168090 ... -0.943166 0.290954 -0.582360 0.267773 -1.455443 0.461292 -0.242356 0.449962 -0.752169 1.697
21 isolate_02.6HR.UT.BRep2 isolate_02 6HR UT BRep2 -0.834661 0.536283 -2.612337 -3.851328 0.428547 ... -1.244496 0.659073 0.976006 0.427998 -1.899695 0.343565 -0.656701 0.487371 -0.918014 1.697
22 isolate_02.6HR.UT.BRep3 isolate_02 6HR UT BRep3 -1.360318 0.809453 -2.452710 -6.027356 -1.344992 ... -0.509423 -0.115217 -0.116530 0.561582 -2.109954 0.945989 -0.108039 0.227384 -0.302962 1.697
23 isolate_02.6HR.UT.BRep4 isolate_02 6HR UT BRep4 -0.480521 0.341766 -0.738533 -5.046426 -0.950383 ... -1.351784 0.264851 -0.608725 1.199647 -1.639487 0.502498 -0.400176 -0.031891 -0.520738 1.697
24 isolate_03.24HR.DHA.BRep1 isolate_03 24HR DHA BRep1 -1.199577 0.598246 -1.248158 -5.855809 -0.580605 ... -1.247383 0.258670 -0.354541 1.161986 -2.014706 0.339105 -0.576505 0.500075 -0.154729 1.799
25 isolate_03.24HR.DHA.BRep2 isolate_03 24HR DHA BRep2 -1.460192 1.927354 -3.237152 -5.949091 0.449967 ... -0.225111 -0.674256 -0.515577 0.555073 -0.781383 0.709991 0.203399 -0.297557 1.779288 1.799
26 isolate_03.24HR.UT.BRep1 isolate_03 24HR UT BRep1 -1.257227 0.447835 -1.850317 -6.961826 -0.975635 ... -1.524692 1.023201 -0.745054 0.757417 -2.138532 0.075439 -0.825895 1.081647 -1.062411 1.799
27 isolate_03.24HR.UT.BRep2 isolate_03 24HR UT BRep2 -0.698684 0.895778 -3.177363 -2.526073 -0.319774 ... -0.693305 -0.389737 -0.736943 0.274204 -2.081892 0.384307 -0.071042 0.133207 1.144208 1.799
28 isolate_03.6HR.DHA.BRep1 isolate_03 6HR DHA BRep1 -0.966405 0.730976 -2.410188 -2.892526 -0.613145 ... -1.061740 0.239158 -1.441044 1.338106 -1.655025 0.208669 -0.087373 0.049857 -0.479941 1.799
29 isolate_03.6HR.DHA.BRep2 isolate_03 6HR DHA BRep2 -1.468334 0.763875 -3.365404 -5.396825 1.169199 ... 0.448757 -0.539882 -0.241445 -0.080214 -2.150805 1.185146 0.641634 -0.185741 0.556788 1.799
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
242 isolate_27.6HR.UT.BRep3 isolate_27 6HR UT BRep3 -1.479733 0.873729 -2.995090 -5.534704 0.788175 ... -0.390459 0.098318 -0.822239 -0.058461 -1.467940 0.686608 0.178390 0.217990 -0.080514 1.348
243 isolate_27.6HR.UT.BRep4 isolate_27 6HR UT BRep4 0.255800 0.743422 0.120315 -4.791222 -1.000942 ... -1.588499 0.668209 -0.494822 0.787637 -1.486726 0.529615 -0.708677 0.979776 -1.327158 1.348
244 isolate_27.6HR.UT.BRep5 isolate_27 6HR UT BRep5 -0.850431 1.315075 -2.330418 -3.352280 -2.052225 ... -1.005504 -0.035619 0.314385 0.773208 -2.199026 0.135653 -0.308070 0.372287 -0.060587 1.348
245 isolate_27.6HR.UT.BRep6 isolate_27 6HR UT BRep6 0.101324 1.137361 -1.836046 -6.264413 -1.192355 ... -1.003004 0.180599 -0.679150 1.154996 -1.423278 0.444403 -0.147474 0.396718 -0.379247 1.348
246 isolate_27.6HR.UT.BRep7 isolate_27 6HR UT BRep7 -1.592814 0.733240 -3.359684 -6.359498 -0.972636 ... -1.208499 0.014496 -1.149130 0.525342 -1.517542 0.384216 0.068098 0.659580 0.714603 1.348
247 isolate_27.6HR.UT.BRep8 isolate_27 6HR UT BRep8 -0.585143 0.345710 -2.985263 -3.598660 -0.677759 ... -0.874534 0.075711 -0.910279 0.736739 -1.940898 0.251690 -0.172153 0.425761 -0.923786 1.348
248 isolate_28.24HR.DHA.BRep1 isolate_28 24HR DHA BRep1 -0.950635 1.058801 -3.279824 -5.804150 0.868882 ... -1.059134 0.122699 -0.275637 0.110776 -2.114496 0.837468 0.219821 0.148167 0.081042 1.453
249 isolate_28.24HR.DHA.BRep2 isolate_28 24HR DHA BRep2 -0.224792 0.857616 -3.837129 -6.450879 0.160659 ... -0.439133 0.133964 -0.270707 0.241441 -2.462835 0.231931 0.115247 0.235549 0.357950 1.453
250 isolate_28.24HR.UT.BRep1 isolate_28 24HR UT BRep1 -1.061311 0.417939 -2.945329 -5.687589 -1.113792 ... 0.005643 0.128742 0.250623 0.192165 -0.877959 0.569689 0.632991 0.530408 0.044373 1.453
251 isolate_28.24HR.UT.BRep2 isolate_28 24HR UT BRep2 -0.572639 0.510610 -1.362488 -2.825827 -3.437945 ... -0.517880 0.591626 -1.086417 0.708568 -0.961700 0.243299 -0.830774 0.962087 -0.540670 1.453
252 isolate_28.6HR.DHA.BRep1 isolate_28 6HR DHA BRep1 -0.786347 -0.316650 -3.093150 -5.671040 -1.063638 ... -0.562073 0.479376 -0.472837 0.200602 -1.436363 0.095742 -0.020455 0.899690 -1.051996 1.453
253 isolate_28.6HR.DHA.BRep2 isolate_28 6HR DHA BRep2 -0.965268 1.059539 -2.754323 -4.151617 -0.277674 ... -0.911576 0.235208 -0.692463 1.140874 -2.454726 0.139945 -0.329656 -0.081388 -0.091958 1.453
254 isolate_28.6HR.UT.BRep1 isolate_28 6HR UT BRep1 -0.276988 0.415027 -0.479694 -4.845695 -1.132520 ... -1.352655 0.202506 -0.205722 0.732571 -0.978727 -0.116941 -0.094547 0.623027 0.122087 1.453
255 isolate_28.6HR.UT.BRep2 isolate_28 6HR UT BRep2 -0.785559 1.026239 -2.357299 -4.837051 -0.343913 ... -0.649602 -0.067990 1.150306 0.421680 -1.818289 0.865032 0.078230 0.173118 1.072409 1.453
256 isolate_29.24HR.DHA.BRep1 isolate_29 24HR DHA BRep1 -1.115310 0.716736 -1.403494 -5.464998 -0.433233 ... -0.726112 0.452986 0.101051 1.499581 -1.864173 -0.064026 -0.659352 0.402915 -0.410209 1.990
257 isolate_29.24HR.DHA.BRep2 isolate_29 24HR DHA BRep2 -1.280405 1.129781 -1.837170 -5.051021 -0.659166 ... -0.690904 0.031148 0.029593 1.368209 -2.201507 0.269939 -0.114870 -0.245257 0.046548 1.990
258 isolate_29.24HR.UT.BRep1 isolate_29 24HR UT BRep1 -0.654726 0.567147 -0.930625 -5.414064 -1.018953 ... -1.268580 0.025631 -0.311755 1.060053 -1.700597 0.287550 0.141410 -0.006351 -0.076215 1.990
259 isolate_29.24HR.UT.BRep2 isolate_29 24HR UT BRep2 -0.563394 0.439543 -1.676543 -3.170102 -3.343840 ... -0.813006 -0.137299 0.273063 0.565321 -0.821248 0.243620 0.015184 0.388667 -0.222223 1.990
260 isolate_29.6HR.DHA.BRep1 isolate_29 6HR DHA BRep1 -1.554723 0.730785 -3.438687 -5.447302 -0.314840 ... -1.267192 0.186690 -1.746578 1.520663 -2.636001 0.754605 -0.475425 0.248404 -0.613732 1.990
261 isolate_29.6HR.DHA.BRep2 isolate_29 6HR DHA BRep2 -0.636178 0.172809 -2.696966 -5.626919 0.109564 ... -0.883134 0.582260 -0.300453 0.101378 -0.786520 0.356458 -0.151425 0.701458 -0.536342 1.990
262 isolate_29.6HR.UT.BRep1 isolate_29 6HR UT BRep1 -0.296083 0.423375 -1.095481 -6.484121 -0.103989 ... -1.318517 1.397622 -1.813682 1.143219 -2.117421 0.096241 -1.361130 1.634048 0.247918 1.990
263 isolate_29.6HR.UT.BRep2 isolate_29 6HR UT BRep2 -1.813041 0.890928 -2.479438 -6.026245 0.827461 ... -0.377092 -0.580702 0.131261 -0.492634 -0.935739 0.708170 0.247816 0.066255 0.315679 1.990
264 isolate_30.24HR.DHA.BRep1 isolate_30 24HR DHA BRep1 -0.975989 1.164478 -3.481287 -6.037603 0.207426 ... -0.379305 -0.228789 0.734703 0.248937 -0.994332 0.875343 0.388887 0.076434 0.612181 1.363
265 isolate_30.24HR.DHA.BRep2 isolate_30 24HR DHA BRep2 -0.471030 0.731458 -2.543038 -5.734210 -2.323102 ... -1.619158 1.421581 -1.248834 0.715166 -2.319369 0.207114 -1.059442 1.005176 -1.289392 1.363
266 isolate_30.24HR.UT.BRep1 isolate_30 24HR UT BRep1 0.042976 0.558593 -1.663835 -2.616689 -3.664556 ... -1.335481 1.120739 -0.864432 1.163530 -2.652660 0.296817 -0.441023 0.831882 -0.964654 1.363
267 isolate_30.24HR.UT.BRep2 isolate_30 24HR UT BRep2 -0.837299 0.317409 -2.262827 -5.933248 -1.007488 ... -1.200081 1.336456 -1.286053 0.803294 -2.141285 -0.001567 -1.079483 0.940947 -1.248146 1.363
268 isolate_30.6HR.DHA.BRep1 isolate_30 6HR DHA BRep1 -0.547496 1.278242 -2.953364 -4.714413 -0.300387 ... -0.621518 1.165279 0.231229 0.605580 -2.454108 1.003054 -0.588209 0.057602 -0.170505 1.363
269 isolate_30.6HR.DHA.BRep2 isolate_30 6HR DHA BRep2 -1.068673 0.473819 -3.041698 -3.134905 -0.066007 ... 0.042170 0.310388 0.486209 0.325325 -0.454830 0.222535 0.447306 0.425515 0.315055 1.363
270 isolate_30.6HR.UT.BRep1 isolate_30 6HR UT BRep1 -1.130075 0.845345 -2.759546 -5.439546 1.166847 ... -0.809479 0.309938 -0.378315 0.090742 -1.698631 0.772424 -0.495931 0.319449 -1.074312 1.363
271 isolate_30.6HR.UT.BRep2 isolate_30 6HR UT BRep2 -1.593430 0.857925 -3.017005 -6.795783 0.233365 ... -0.523162 0.002438 -0.258810 0.178586 -1.783412 0.647358 0.556913 0.159119 0.983065 1.363

272 rows × 40 columns

数据处理

# Drop the sample identifier column.
train_data.drop(['Sample_Name'], inplace=True, axis=1)
# Keep only biological replicates 1 and 2 (the only ones present in the test
# table). The explicit copy makes the assignments below modify an independent
# frame rather than a filtered slice, which avoids SettingWithCopyWarning.
train_data = train_data[train_data['BioRep'].isin(['BRep1', 'BRep2'])].copy()
# Encode the categorical columns as 0/1. A plain dict lookup is kept so that
# an unexpected category raises KeyError instead of being silently ignored.
for column, map1 in (
    ('Timepoint', {'24HR': 0, '6HR': 1}),
    ('Treatment', {'DHA': 0, 'UT': 1}),
    ('BioRep', {'BRep1': 0, 'BRep2': 1}),
):
    temp = list(train_data[column])
    train_data[column] = [map1[x] for x in temp]
train_data.head()
Isolate Timepoint Treatment BioRep MAL1.59416.59687....kr...can PF3D7_0108200 PF3D7_0115400 PF3D7_0201000 PF3D7_0201500 PF3D7_0201600 ... PF3D7_1247200 PF3D7_1321000 PF3D7_1332500 PF3D7_1337700 PF3D7_1401000 PF3D7_1431300 PF3D7_1445900 PF3D7_1452200 PF3D7_1456900 DHA_IC50
0 isolate_01 0 0 0 -0.443936 -0.100675 -2.709971 -3.213864 -1.091086 -1.111358 ... -0.985147 0.597583 -1.155107 0.566342 -3.282937 -0.828819 -0.988564 1.273673 -1.168364 2.177
1 isolate_01 0 0 1 -1.186611 0.735564 -2.472694 -3.924790 -0.645725 -0.681182 ... -0.855834 0.246352 -2.062505 0.728823 -2.569909 0.395816 -0.221650 0.342256 0.356901 2.177
2 isolate_01 0 1 0 -0.538092 1.220332 -1.666902 -5.039397 1.507344 1.117250 ... -0.255342 -0.056962 -0.741269 -0.357539 -1.195457 0.443488 0.627498 0.386491 1.687583 2.177
3 isolate_01 0 1 1 -0.828352 0.575415 -2.744366 -5.905546 -0.568281 -0.055037 ... -0.558252 0.201290 -0.252612 1.202704 -1.468765 0.700349 0.024436 -0.014673 0.206773 2.177
4 isolate_01 1 0 0 -1.481881 0.283878 -2.322445 -5.374090 0.319846 -0.059816 ... -0.422696 -0.244172 -0.013380 0.468331 -1.030617 0.364250 -0.580314 0.564132 -0.431109 2.177

5 rows × 39 columns

随机森林搜索超参数

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Hold out 30 rows for validation, stratified by isolate.
# NOTE(review): no random_state is passed, so the split differs between runs,
# and rows from the same isolate land in both splits although the test table
# contains only unseen isolates -- confirm this validation scheme is intended.
train, valid = train_test_split(
    train_data,
    test_size=30,
    stratify=train_data['Isolate'],
)

# Separate the target from the predictors; Isolate is not used as a feature.
non_feature_cols = ['DHA_IC50', 'Isolate']
train_dha, valid_dha = train.DHA_IC50, valid.DHA_IC50
train_feature = train.drop(columns=non_feature_cols)
valid_feature = valid.drop(columns=non_feature_cols)

# Exhaustive 5-fold grid search over the forest's regularisation parameters.
rf = RandomForestRegressor(n_estimators=200, oob_score=True, random_state=1)
param_grid = {
    "max_features": list(np.arange(0.66, 0.97, 0.03)),
    "min_samples_split": list(np.arange(5, 50, 2)),
    "min_samples_leaf": list(np.arange(1, 8, 1)),
    "max_depth": list(np.arange(4, 10, 1)),
}
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(train_feature, train_dha)
print("Best parameters:{}".format(grid_search.best_params_))
Best parameters:{'max_depth': 9, 'max_features': 0.8100000000000002, 'min_samples_leaf': 6, 'min_samples_split': 19}

线性回归

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# Learn each feature's minimum and maximum from the training split only.
sc = MinMaxScaler().fit(train_feature)
# Apply the same fitted scaling to both splits so they stay consistent.
ttrain_feature = sc.transform(train_feature)
vvalid_feature = sc.transform(valid_feature)

# Ordinary least squares on the scaled features.
lr = LinearRegression()
lr.fit(ttrain_feature, train_dha)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
# Report the Spearman correlation of the linear model on the training and
# validation splits. `evalution` is defined further down in this write-up, so
# that definition has to be executed before these calls.
evalution(train_dha, lr.predict(ttrain_feature))
evalution(valid_dha, lr.predict(vvalid_feature))
相关系数为: 0.4272672215972652  置信度为: 99.99999999003602 %
相关系数为: 0.06696329254727475  置信度为: 27.48492169435901 %

对结果进行评估

from scipy.stats import spearmanr
def evalution(x, y):
    """Print the Spearman rank correlation between *x* and *y*.

    Args:
        x: First sequence of observations.
        y: Second sequence of observations, paired with *x*.

    Prints the correlation coefficient followed by ``(1 - p) * 100``, which
    the message labels as a confidence percentage. Returns ``None``.
    """
    rho, p_value = spearmanr(x, y)
    confidence_pct = (1 - p_value) * 100
    print('相关系数为:', rho, " 置信度为:", confidence_pct, "%")
# Score the grid-searched random forest on the training and validation splits.
evalution(train_dha, grid_search.predict(train_feature))
evalution(valid_dha, grid_search.predict(valid_feature))
相关系数为: 0.8908044428995714  置信度为: 100.0 %
相关系数为: 0.25428253615127916  置信度为: 82.4899021040629 %
# Refit a forest with the hyper-parameters reported by the grid search, using
# far more trees than during the search.
# NOTE(review): 300000 trees makes fitting, prediction and the pickled model
# very expensive -- confirm this count is intended.
rf = RandomForestRegressor(
    n_estimators=300000,
    max_depth=9,
    max_features=0.8100000000000002,
    min_samples_leaf=6,
    min_samples_split=19,
    oob_score=True,
    random_state=1,
)
rf.fit(train_feature, train_dha)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
           max_features=0.8100000000000002, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=6, min_samples_split=19,
           min_weight_fraction_leaf=0.0, n_estimators=300000, n_jobs=None,
           oob_score=True, random_state=1, verbose=0, warm_start=False)
# Score the refit forest on the training and validation splits.
evalution(train_dha, rf.predict(train_feature))
evalution(valid_dha, rf.predict(valid_feature))
# Call get_params(): the bare attribute only echoes the bound method object
# instead of the parameter dictionary.
rf.get_params()
相关系数为: 0.8959320670290216  置信度为: 100.0 %
相关系数为: 0.3027808676307008  置信度为: 89.61282920162074 %



<bound method BaseEstimator.get_params of RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
           max_features=0.8100000000000002, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=6, min_samples_split=19,
           min_weight_fraction_leaf=0.0, n_estimators=300000, n_jobs=None,
           oob_score=True, random_state=1, verbose=0, warm_start=False)>
# Per-isolate summaries of the forest's predictions on the training table:
# the mean observed DHA_IC50 plus the mean / median / max / min prediction.
Target = []
Mean = []
Median = []
Min = []
Max = []
name = ""
for i in range(1, 31):
    # Isolate labels are zero-padded to two digits: isolate_01 ... isolate_30.
    name = "isolate_{:02d}".format(i)
    temp = train_data[train_data['Isolate'] == name]
    target = temp.DHA_IC50
    feature = temp.drop(['DHA_IC50', 'Isolate'], axis=1)
    # Predict once per isolate and derive every summary from the same array,
    # instead of re-running the forest in four separate loops.
    predicts = rf.predict(feature)
    Target.append(target.mean())
    Mean.append(predicts.mean())
    Median.append(np.median(predicts))
    Max.append(np.max(predicts))
    Min.append(np.min(predicts))
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Persist the fitted forest to rf.pkl in the working directory.
joblib.dump(rf, 'rf.pkl')
['rf.pkl']
import itertools
def _repeat_each(values, times=8):
    """Return a list with every element of *values* repeated *times* in a row."""
    return list(itertools.chain.from_iterable([v] * times for v in values))


# Expand each per-isolate value to 8 consecutive entries so the lists line up
# with the row-level predictions below.
# NOTE(review): this assumes every isolate contributes exactly 8 rows and that
# train_data is ordered by isolate -- confirm against the filtered table.
Target = _repeat_each(Target)
Mean = _repeat_each(Mean)
Median = _repeat_each(Median)
Max = _repeat_each(Max)
Min = _repeat_each(Min)

# Row-level predictions for the whole training table.
predicts = rf.predict(train_data.drop(columns=['DHA_IC50', 'Isolate']))

# Compare the observed per-isolate target with the raw predictions and with
# each per-isolate summary of the predictions.
evalution(Target, predicts)
evalution(Target, Mean)
evalution(Target, Median)
evalution(Target, Max)
evalution(Target, Min)
相关系数为: 0.8303088922694587  置信度为: 100.0 %
相关系数为: 0.9777530589543939  置信度为: 100.0 %
相关系数为: 0.974193548387097  置信度为: 100.0 %
相关系数为: 0.8327030033370413  置信度为: 100.0 %
相关系数为: 0.8909899888765297  置信度为: 100.0 %

获得计算结果

# Apply the training-time column selection to the test table and remove its
# sample identifier column.
test_data.drop(columns=dropnames, inplace=True)
test_data.drop(columns=['Sample_Names'], inplace=True)

# Encode the categorical columns with the same 0/1 codes used for training.
for column, map1 in (
    ('Timepoint', {'24HR': 0, '6HR': 1}),
    ('Treatment', {'DHA': 0, 'UT': 1}),
    ('BioRep', {'BRep1': 0, 'BRep2': 1}),
):
    temp = list(test_data[column])
    test_data[column] = [map1[x] for x in temp]

# Predict every row of each test isolate (isolate_31 ... isolate_55) and use
# the mean prediction as that isolate's IC50 estimate.
Names = []
MMeans = []
for i in range(31, 56):
    name = "isolate_" + str(i)
    is_current = test_data['Isolate'] == name
    temp = test_data[is_current]
    feature = temp.drop(columns=['DHA_IC50', 'Isolate'])
    predicts = rf.predict(feature)
    Names.append(name)
    MMeans.append(predicts.mean())

# Write the tab-separated submission file.
dataframe = pd.DataFrame({'Isolate': list(Names), 'Predicted_IC50': MMeans})
dataframe.to_csv("SubCh1_Submission.txt", sep='\t', index=False)
相关标签: 预测 随机森林