青蒿素抗疟疾效果预测比赛
程序员文章站
2022-03-22 18:00:40
...
比赛网址及说明
https://www.synapse.org/#!Synapse:syn16924919/wiki/583955
读取文件并简单分析
import pandas as pd
# Load the challenge's training and test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./SubCh1_TrainingData.csv', './SubCh1_TestData.csv')
)
# Preview the first rows of the training table (notebook display).
train_data.head()
Sample_Name | Isolate | Timepoint | Treatment | BioRep | MAL1.465720.465873.bc..rv.. | MAL1.48287.48430....kr...can | MAL1.562126.562246.bc..r... | MAL1.59416.59687....kr...can | MAL1.616633.616682.......y. | ... | PF3D7_1479200 | PF3D7_1479400 | PF3D7_1479500 | PF3D7_1479600 | PF3D7_1479700 | PF3D7_1479800 | PF3D7_1479900 | PF3D7_1480000 | PF3D7_1480100 | DHA_IC50 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | isolate_01.24HR.DHA.BRep1 | isolate_01 | 24HR | DHA | BRep1 | 0.008286 | -0.959492 | 0.206468 | -0.443936 | 0.453030 | ... | -0.540993 | -3.674097 | -1.652979 | -2.255490 | -4.554757 | -0.381422 | -1.415857 | -4.121011 | -2.486528 | 2.177 |
1 | isolate_01.24HR.DHA.BRep2 | isolate_01 | 24HR | DHA | BRep2 | -0.872028 | -1.968558 | -2.308563 | -1.186611 | 0.508575 | ... | 0.450649 | -4.464408 | -0.977954 | -2.012559 | -4.538550 | -2.333890 | -2.342496 | -4.774197 | -1.794568 | 2.177 |
2 | isolate_01.24HR.UT.BRep1 | isolate_01 | 24HR | UT | BRep1 | 0.039480 | -1.291371 | -0.946661 | -0.538092 | -1.778501 | ... | 1.411557 | -5.500522 | -1.764229 | -1.947208 | -5.331037 | -0.814417 | -2.141285 | -3.922684 | -2.495169 | 2.177 |
3 | isolate_01.24HR.UT.BRep2 | isolate_01 | 24HR | UT | BRep2 | 0.125177 | -1.462758 | -1.888105 | -0.828352 | -1.410803 | ... | -0.659519 | -5.011207 | -0.559471 | -2.266763 | -5.289318 | -2.949869 | -2.206432 | -4.324299 | -1.735312 | 2.177 |
4 | isolate_01.6HR.DHA.BRep1 | isolate_01 | 6HR | DHA | BRep1 | 1.354956 | -2.139128 | -0.541696 | -1.481881 | -0.735798 | ... | -1.687048 | -4.773306 | -0.048847 | -2.023727 | -4.398358 | -3.080500 | -0.566545 | -4.798190 | -0.821690 | 2.177 |
5 rows × 5546 columns
test_data.head()
Sample_Names | Isolate | Timepoint | Treatment | BioRep | MAL1.465720.465873.bc..rv.. | MAL1.48287.48430....kr...can | MAL1.562126.562246.bc..r... | MAL1.59416.59687....kr...can | MAL1.616633.616682.......y. | ... | PF3D7_1479200 | PF3D7_1479400 | PF3D7_1479500 | PF3D7_1479600 | PF3D7_1479700 | PF3D7_1479800 | PF3D7_1479900 | PF3D7_1480000 | PF3D7_1480100 | DHA_IC50 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | isolate_31.24HR.DHA.BRep1 | isolate_31 | 24HR | DHA | BRep1 | 0.416583 | -2.304394 | -1.279308 | -1.128485 | 0.129698 | ... | 0.117763 | -1.650283 | -0.520373 | -1.301428 | -5.326124 | -0.352736 | -1.567931 | -4.446439 | -1.875440 | NaN |
1 | isolate_31.24HR.DHA.BRep2 | isolate_31 | 24HR | DHA | BRep2 | 0.247195 | -2.176939 | -1.470318 | -0.676434 | -0.206543 | ... | 0.472336 | -1.151134 | -0.342707 | -1.001569 | -5.337602 | -0.672201 | -1.494983 | -4.299427 | -1.553652 | NaN |
2 | isolate_31.24HR.UT.BRep1 | isolate_31 | 24HR | UT | BRep1 | -0.197699 | -0.572531 | 0.270896 | 0.557528 | -0.698585 | ... | -0.309768 | -6.330446 | -0.850889 | -1.603832 | -3.256602 | -2.517157 | -0.650735 | -4.841146 | 0.153951 | NaN |
3 | isolate_31.24HR.UT.BRep2 | isolate_31 | 24HR | UT | BRep2 | 0.966916 | -1.723830 | -1.102392 | -0.797996 | 0.032847 | ... | -0.254186 | -1.516661 | -1.916219 | -1.858716 | -6.483167 | -0.827476 | -1.454423 | -5.264615 | -2.367444 | NaN |
4 | isolate_31.6HR.DHA.BRep1 | isolate_31 | 6HR | DHA | BRep1 | 0.125273 | -1.665749 | -2.510245 | -1.454094 | -1.763451 | ... | 0.653097 | -2.383878 | -2.731928 | -2.557962 | -5.697488 | -3.466542 | -2.147289 | -4.358510 | -1.929927 | NaN |
5 rows × 5546 columns
import seaborn as sns
import matplotlib.pyplot as plt
# Kernel density estimate of the training-set DHA_IC50 values.
# Two colours are drawn from the 'Set1' palette; c2 is not used in this cell.
c1, c2 = sns.color_palette('Set1', 2)
f, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(8, 6))
sns.kdeplot(
    train_data.DHA_IC50,
    shade=True,
    color=c1,
    label='DHA_IC50',
    ax=ax,
)
from collections import Counter
Counter(train_data.Isolate)
Counter({'isolate_01': 8,
'isolate_02': 16,
'isolate_03': 8,
'isolate_04': 8,
'isolate_05': 8,
'isolate_06': 8,
'isolate_07': 8,
'isolate_08': 8,
'isolate_09': 8,
'isolate_10': 8,
'isolate_11': 8,
'isolate_12': 8,
'isolate_13': 8,
'isolate_14': 8,
'isolate_15': 8,
'isolate_16': 8,
'isolate_17': 8,
'isolate_18': 8,
'isolate_19': 8,
'isolate_20': 8,
'isolate_21': 8,
'isolate_22': 8,
'isolate_23': 8,
'isolate_24': 8,
'isolate_25': 8,
'isolate_26': 8,
'isolate_27': 32,
'isolate_28': 8,
'isolate_29': 8,
'isolate_30': 8})
Counter(test_data.Isolate)
Counter({'isolate_31': 8,
'isolate_32': 8,
'isolate_33': 8,
'isolate_34': 8,
'isolate_35': 8,
'isolate_36': 8,
'isolate_37': 8,
'isolate_38': 8,
'isolate_39': 8,
'isolate_40': 8,
'isolate_41': 8,
'isolate_42': 8,
'isolate_43': 8,
'isolate_44': 8,
'isolate_45': 8,
'isolate_46': 8,
'isolate_47': 8,
'isolate_48': 8,
'isolate_49': 8,
'isolate_50': 8,
'isolate_51': 8,
'isolate_52': 8,
'isolate_53': 8,
'isolate_54': 8,
'isolate_55': 8})
可以看到训练集和测试集并不共享 isolate,因此 Isolate 这一列不具有使用价值(不能作为特征)。
Counter(train_data.Timepoint)
Counter({'24HR': 136, '6HR': 136})
Counter(test_data.Timepoint)
Counter({'24HR': 100, '6HR': 100})
Counter(train_data.Treatment)
Counter({'DHA': 136, 'UT': 136})
Counter(test_data.Treatment)
Counter({'DHA': 100, 'UT': 100})
Counter(train_data.BioRep)
Counter({'BRep1': 120,
'BRep2': 120,
'BRep3': 8,
'BRep4': 8,
'BRep5': 4,
'BRep6': 4,
'BRep7': 4,
'BRep8': 4})
Counter(test_data.BioRep)
Counter({'BRep1': 100, 'BRep2': 100})
相关系数筛选法降维
# Spearman correlation between every numeric column of the training table.
# The numeric columns are selected explicitly because the frame still holds
# string columns (Sample_Name, Isolate, ...); recent pandas versions raise on
# those instead of silently dropping them.
# NOTE(review): the correlations use every training row, including rows that
# are later held out for validation, so the validation scores are optimistic.
CORR = train_data.select_dtypes(include='number').corr('spearman')
# Correlation of each numeric column with the target (DHA_IC50 with itself
# is 1.0, so the target always passes the threshold below).
CORRDHA = list(CORR.DHA_IC50)
f, ax1 = plt.subplots(1, 1, sharex=True, figsize=(6, 3))
sns.kdeplot(CORRDHA, shade=True, color=c1, label='cor for DHA_IC50', ax=ax1)
# Positions within CORR's columns whose |rho| with the target exceeds 0.12.
select_cor = [i for i, rho in enumerate(CORRDHA) if abs(rho) > 0.12]
len(select_cor)
35
# Names of all columns that took part in the correlation screen.
colnames = CORR.columns.tolist()
# Columns that passed the threshold (looked up by position).
colnamess = [colnames[position] for position in select_cor]
# Everything else is scheduled for removal; order is irrelevant for drop().
dropnames = list(set(colnames).difference(colnamess))
len(dropnames)
5506
# Remove, in place, every column that failed the correlation screen.
train_data.drop(columns=dropnames, inplace=True)
# Display the reduced training table (notebook display).
train_data
Sample_Name | Isolate | Timepoint | Treatment | BioRep | MAL1.59416.59687....kr...can | PF3D7_0108200 | PF3D7_0115400 | PF3D7_0201000 | PF3D7_0201500 | ... | PF3D7_1247200 | PF3D7_1321000 | PF3D7_1332500 | PF3D7_1337700 | PF3D7_1401000 | PF3D7_1431300 | PF3D7_1445900 | PF3D7_1452200 | PF3D7_1456900 | DHA_IC50 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | isolate_01.24HR.DHA.BRep1 | isolate_01 | 24HR | DHA | BRep1 | -0.443936 | -0.100675 | -2.709971 | -3.213864 | -1.091086 | ... | -0.985147 | 0.597583 | -1.155107 | 0.566342 | -3.282937 | -0.828819 | -0.988564 | 1.273673 | -1.168364 | 2.177 |
1 | isolate_01.24HR.DHA.BRep2 | isolate_01 | 24HR | DHA | BRep2 | -1.186611 | 0.735564 | -2.472694 | -3.924790 | -0.645725 | ... | -0.855834 | 0.246352 | -2.062505 | 0.728823 | -2.569909 | 0.395816 | -0.221650 | 0.342256 | 0.356901 | 2.177 |
2 | isolate_01.24HR.UT.BRep1 | isolate_01 | 24HR | UT | BRep1 | -0.538092 | 1.220332 | -1.666902 | -5.039397 | 1.507344 | ... | -0.255342 | -0.056962 | -0.741269 | -0.357539 | -1.195457 | 0.443488 | 0.627498 | 0.386491 | 1.687583 | 2.177 |
3 | isolate_01.24HR.UT.BRep2 | isolate_01 | 24HR | UT | BRep2 | -0.828352 | 0.575415 | -2.744366 | -5.905546 | -0.568281 | ... | -0.558252 | 0.201290 | -0.252612 | 1.202704 | -1.468765 | 0.700349 | 0.024436 | -0.014673 | 0.206773 | 2.177 |
4 | isolate_01.6HR.DHA.BRep1 | isolate_01 | 6HR | DHA | BRep1 | -1.481881 | 0.283878 | -2.322445 | -5.374090 | 0.319846 | ... | -0.422696 | -0.244172 | -0.013380 | 0.468331 | -1.030617 | 0.364250 | -0.580314 | 0.564132 | -0.431109 | 2.177 |
5 | isolate_01.6HR.DHA.BRep2 | isolate_01 | 6HR | DHA | BRep2 | -0.818662 | 0.639680 | -2.786851 | -5.406612 | -0.960168 | ... | -1.585799 | 0.386760 | -0.882748 | 1.510844 | -2.030041 | 0.529667 | -0.535430 | 0.156222 | -1.313564 | 2.177 |
6 | isolate_01.6HR.UT.BRep1 | isolate_01 | 6HR | UT | BRep1 | -1.441343 | 0.543945 | -0.516681 | -3.725347 | -3.973098 | ... | -0.908802 | 1.008485 | -0.722783 | -0.004425 | -1.579508 | 0.668031 | -0.381967 | 0.866779 | -0.277187 | 2.177 |
7 | isolate_01.6HR.UT.BRep2 | isolate_01 | 6HR | UT | BRep2 | -1.424590 | 0.984280 | -3.440088 | -5.533135 | -0.368167 | ... | -0.302371 | -0.258883 | -0.655029 | 0.702457 | -2.897724 | 1.122203 | 0.288328 | 0.671121 | 0.862057 | 2.177 |
8 | isolate_02.24HR.DHA.BRep1 | isolate_02 | 24HR | DHA | BRep1 | -1.484605 | 0.833452 | -3.422781 | -5.319162 | -0.373591 | ... | -1.036664 | 0.266881 | -0.922936 | 1.320746 | -2.301671 | 0.732093 | -0.254396 | -0.287382 | -0.564506 | 1.697 |
9 | isolate_02.24HR.DHA.BRep2 | isolate_02 | 24HR | DHA | BRep2 | -0.686910 | 1.566094 | -3.461451 | -6.284323 | 0.606515 | ... | -0.113845 | -0.066738 | 0.238364 | -0.359802 | -1.050140 | 1.392860 | 0.348988 | 1.027637 | 1.305501 | 1.697 |
10 | isolate_02.24HR.DHA.BRep3 | isolate_02 | 24HR | DHA | BRep3 | -1.337437 | 0.870020 | -1.086237 | -4.927345 | -0.236146 | ... | -1.136695 | 0.139889 | -0.690148 | 0.711058 | -2.461177 | 0.563514 | 0.074102 | 0.008713 | 0.097603 | 1.697 |
11 | isolate_02.24HR.DHA.BRep4 | isolate_02 | 24HR | DHA | BRep4 | -1.144765 | 0.603688 | -2.186673 | -5.445916 | -1.060517 | ... | -0.355237 | 0.179440 | -0.413934 | 0.418851 | -1.121716 | 0.945285 | 0.304349 | 0.240995 | 0.413619 | 1.697 |
12 | isolate_02.24HR.UT.BRep1 | isolate_02 | 24HR | UT | BRep1 | -0.969815 | 0.428852 | -2.079885 | -3.698236 | -1.010320 | ... | -1.362287 | 0.869552 | 0.897190 | 1.477135 | -1.459841 | 0.131137 | -0.968894 | 0.530704 | -1.180350 | 1.697 |
13 | isolate_02.24HR.UT.BRep2 | isolate_02 | 24HR | UT | BRep2 | -0.831103 | 1.055918 | -3.294669 | -6.621351 | -0.365654 | ... | -0.215727 | -0.105166 | 0.295103 | 0.566553 | -1.289015 | 1.008094 | 0.539046 | -0.099084 | 0.611115 | 1.697 |
14 | isolate_02.24HR.UT.BRep3 | isolate_02 | 24HR | UT | BRep3 | -1.067333 | 0.453303 | -2.675612 | -6.288987 | -0.541833 | ... | -1.013739 | 0.328516 | -0.465013 | 0.451698 | -1.176785 | 0.396625 | -0.200786 | 0.523071 | -0.903835 | 1.697 |
15 | isolate_02.24HR.UT.BRep4 | isolate_02 | 24HR | UT | BRep4 | -0.839616 | 0.446568 | -2.450340 | -5.608584 | -0.604042 | ... | -0.957355 | 0.489947 | -1.122378 | 0.607087 | -0.741156 | 0.512648 | -0.257656 | 0.610839 | -0.684762 | 1.697 |
16 | isolate_02.6HR.DHA.BRep1 | isolate_02 | 6HR | DHA | BRep1 | -0.792754 | 0.958172 | -0.852579 | -5.006698 | -0.828517 | ... | -1.377291 | 0.159294 | -0.532192 | 1.358049 | -2.365829 | 0.281288 | -0.214322 | -0.123179 | -0.640692 | 1.697 |
17 | isolate_02.6HR.DHA.BRep2 | isolate_02 | 6HR | DHA | BRep2 | -1.046670 | 0.098186 | -1.771210 | -5.853528 | -0.305286 | ... | -0.837113 | 0.295548 | -0.113224 | 0.320856 | -0.736279 | -0.230788 | -0.082347 | 0.430137 | -0.242890 | 1.697 |
18 | isolate_02.6HR.DHA.BRep3 | isolate_02 | 6HR | DHA | BRep3 | -1.588809 | 0.739118 | -2.904774 | -5.202351 | 1.385476 | ... | 0.199116 | -0.477491 | 0.665402 | 0.022030 | -0.120614 | 0.942747 | 0.473917 | -0.051604 | 0.569513 | 1.697 |
19 | isolate_02.6HR.DHA.BRep4 | isolate_02 | 6HR | DHA | BRep4 | -0.680281 | 1.448573 | -2.357755 | -4.116196 | -1.124109 | ... | -0.920867 | 0.065383 | -1.071213 | 1.024844 | -1.541809 | 0.218041 | -0.326610 | 0.665595 | 0.324805 | 1.697 |
20 | isolate_02.6HR.UT.BRep1 | isolate_02 | 6HR | UT | BRep1 | -1.085373 | 0.531026 | -2.309487 | -4.641513 | 0.168090 | ... | -0.943166 | 0.290954 | -0.582360 | 0.267773 | -1.455443 | 0.461292 | -0.242356 | 0.449962 | -0.752169 | 1.697 |
21 | isolate_02.6HR.UT.BRep2 | isolate_02 | 6HR | UT | BRep2 | -0.834661 | 0.536283 | -2.612337 | -3.851328 | 0.428547 | ... | -1.244496 | 0.659073 | 0.976006 | 0.427998 | -1.899695 | 0.343565 | -0.656701 | 0.487371 | -0.918014 | 1.697 |
22 | isolate_02.6HR.UT.BRep3 | isolate_02 | 6HR | UT | BRep3 | -1.360318 | 0.809453 | -2.452710 | -6.027356 | -1.344992 | ... | -0.509423 | -0.115217 | -0.116530 | 0.561582 | -2.109954 | 0.945989 | -0.108039 | 0.227384 | -0.302962 | 1.697 |
23 | isolate_02.6HR.UT.BRep4 | isolate_02 | 6HR | UT | BRep4 | -0.480521 | 0.341766 | -0.738533 | -5.046426 | -0.950383 | ... | -1.351784 | 0.264851 | -0.608725 | 1.199647 | -1.639487 | 0.502498 | -0.400176 | -0.031891 | -0.520738 | 1.697 |
24 | isolate_03.24HR.DHA.BRep1 | isolate_03 | 24HR | DHA | BRep1 | -1.199577 | 0.598246 | -1.248158 | -5.855809 | -0.580605 | ... | -1.247383 | 0.258670 | -0.354541 | 1.161986 | -2.014706 | 0.339105 | -0.576505 | 0.500075 | -0.154729 | 1.799 |
25 | isolate_03.24HR.DHA.BRep2 | isolate_03 | 24HR | DHA | BRep2 | -1.460192 | 1.927354 | -3.237152 | -5.949091 | 0.449967 | ... | -0.225111 | -0.674256 | -0.515577 | 0.555073 | -0.781383 | 0.709991 | 0.203399 | -0.297557 | 1.779288 | 1.799 |
26 | isolate_03.24HR.UT.BRep1 | isolate_03 | 24HR | UT | BRep1 | -1.257227 | 0.447835 | -1.850317 | -6.961826 | -0.975635 | ... | -1.524692 | 1.023201 | -0.745054 | 0.757417 | -2.138532 | 0.075439 | -0.825895 | 1.081647 | -1.062411 | 1.799 |
27 | isolate_03.24HR.UT.BRep2 | isolate_03 | 24HR | UT | BRep2 | -0.698684 | 0.895778 | -3.177363 | -2.526073 | -0.319774 | ... | -0.693305 | -0.389737 | -0.736943 | 0.274204 | -2.081892 | 0.384307 | -0.071042 | 0.133207 | 1.144208 | 1.799 |
28 | isolate_03.6HR.DHA.BRep1 | isolate_03 | 6HR | DHA | BRep1 | -0.966405 | 0.730976 | -2.410188 | -2.892526 | -0.613145 | ... | -1.061740 | 0.239158 | -1.441044 | 1.338106 | -1.655025 | 0.208669 | -0.087373 | 0.049857 | -0.479941 | 1.799 |
29 | isolate_03.6HR.DHA.BRep2 | isolate_03 | 6HR | DHA | BRep2 | -1.468334 | 0.763875 | -3.365404 | -5.396825 | 1.169199 | ... | 0.448757 | -0.539882 | -0.241445 | -0.080214 | -2.150805 | 1.185146 | 0.641634 | -0.185741 | 0.556788 | 1.799 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
242 | isolate_27.6HR.UT.BRep3 | isolate_27 | 6HR | UT | BRep3 | -1.479733 | 0.873729 | -2.995090 | -5.534704 | 0.788175 | ... | -0.390459 | 0.098318 | -0.822239 | -0.058461 | -1.467940 | 0.686608 | 0.178390 | 0.217990 | -0.080514 | 1.348 |
243 | isolate_27.6HR.UT.BRep4 | isolate_27 | 6HR | UT | BRep4 | 0.255800 | 0.743422 | 0.120315 | -4.791222 | -1.000942 | ... | -1.588499 | 0.668209 | -0.494822 | 0.787637 | -1.486726 | 0.529615 | -0.708677 | 0.979776 | -1.327158 | 1.348 |
244 | isolate_27.6HR.UT.BRep5 | isolate_27 | 6HR | UT | BRep5 | -0.850431 | 1.315075 | -2.330418 | -3.352280 | -2.052225 | ... | -1.005504 | -0.035619 | 0.314385 | 0.773208 | -2.199026 | 0.135653 | -0.308070 | 0.372287 | -0.060587 | 1.348 |
245 | isolate_27.6HR.UT.BRep6 | isolate_27 | 6HR | UT | BRep6 | 0.101324 | 1.137361 | -1.836046 | -6.264413 | -1.192355 | ... | -1.003004 | 0.180599 | -0.679150 | 1.154996 | -1.423278 | 0.444403 | -0.147474 | 0.396718 | -0.379247 | 1.348 |
246 | isolate_27.6HR.UT.BRep7 | isolate_27 | 6HR | UT | BRep7 | -1.592814 | 0.733240 | -3.359684 | -6.359498 | -0.972636 | ... | -1.208499 | 0.014496 | -1.149130 | 0.525342 | -1.517542 | 0.384216 | 0.068098 | 0.659580 | 0.714603 | 1.348 |
247 | isolate_27.6HR.UT.BRep8 | isolate_27 | 6HR | UT | BRep8 | -0.585143 | 0.345710 | -2.985263 | -3.598660 | -0.677759 | ... | -0.874534 | 0.075711 | -0.910279 | 0.736739 | -1.940898 | 0.251690 | -0.172153 | 0.425761 | -0.923786 | 1.348 |
248 | isolate_28.24HR.DHA.BRep1 | isolate_28 | 24HR | DHA | BRep1 | -0.950635 | 1.058801 | -3.279824 | -5.804150 | 0.868882 | ... | -1.059134 | 0.122699 | -0.275637 | 0.110776 | -2.114496 | 0.837468 | 0.219821 | 0.148167 | 0.081042 | 1.453 |
249 | isolate_28.24HR.DHA.BRep2 | isolate_28 | 24HR | DHA | BRep2 | -0.224792 | 0.857616 | -3.837129 | -6.450879 | 0.160659 | ... | -0.439133 | 0.133964 | -0.270707 | 0.241441 | -2.462835 | 0.231931 | 0.115247 | 0.235549 | 0.357950 | 1.453 |
250 | isolate_28.24HR.UT.BRep1 | isolate_28 | 24HR | UT | BRep1 | -1.061311 | 0.417939 | -2.945329 | -5.687589 | -1.113792 | ... | 0.005643 | 0.128742 | 0.250623 | 0.192165 | -0.877959 | 0.569689 | 0.632991 | 0.530408 | 0.044373 | 1.453 |
251 | isolate_28.24HR.UT.BRep2 | isolate_28 | 24HR | UT | BRep2 | -0.572639 | 0.510610 | -1.362488 | -2.825827 | -3.437945 | ... | -0.517880 | 0.591626 | -1.086417 | 0.708568 | -0.961700 | 0.243299 | -0.830774 | 0.962087 | -0.540670 | 1.453 |
252 | isolate_28.6HR.DHA.BRep1 | isolate_28 | 6HR | DHA | BRep1 | -0.786347 | -0.316650 | -3.093150 | -5.671040 | -1.063638 | ... | -0.562073 | 0.479376 | -0.472837 | 0.200602 | -1.436363 | 0.095742 | -0.020455 | 0.899690 | -1.051996 | 1.453 |
253 | isolate_28.6HR.DHA.BRep2 | isolate_28 | 6HR | DHA | BRep2 | -0.965268 | 1.059539 | -2.754323 | -4.151617 | -0.277674 | ... | -0.911576 | 0.235208 | -0.692463 | 1.140874 | -2.454726 | 0.139945 | -0.329656 | -0.081388 | -0.091958 | 1.453 |
254 | isolate_28.6HR.UT.BRep1 | isolate_28 | 6HR | UT | BRep1 | -0.276988 | 0.415027 | -0.479694 | -4.845695 | -1.132520 | ... | -1.352655 | 0.202506 | -0.205722 | 0.732571 | -0.978727 | -0.116941 | -0.094547 | 0.623027 | 0.122087 | 1.453 |
255 | isolate_28.6HR.UT.BRep2 | isolate_28 | 6HR | UT | BRep2 | -0.785559 | 1.026239 | -2.357299 | -4.837051 | -0.343913 | ... | -0.649602 | -0.067990 | 1.150306 | 0.421680 | -1.818289 | 0.865032 | 0.078230 | 0.173118 | 1.072409 | 1.453 |
256 | isolate_29.24HR.DHA.BRep1 | isolate_29 | 24HR | DHA | BRep1 | -1.115310 | 0.716736 | -1.403494 | -5.464998 | -0.433233 | ... | -0.726112 | 0.452986 | 0.101051 | 1.499581 | -1.864173 | -0.064026 | -0.659352 | 0.402915 | -0.410209 | 1.990 |
257 | isolate_29.24HR.DHA.BRep2 | isolate_29 | 24HR | DHA | BRep2 | -1.280405 | 1.129781 | -1.837170 | -5.051021 | -0.659166 | ... | -0.690904 | 0.031148 | 0.029593 | 1.368209 | -2.201507 | 0.269939 | -0.114870 | -0.245257 | 0.046548 | 1.990 |
258 | isolate_29.24HR.UT.BRep1 | isolate_29 | 24HR | UT | BRep1 | -0.654726 | 0.567147 | -0.930625 | -5.414064 | -1.018953 | ... | -1.268580 | 0.025631 | -0.311755 | 1.060053 | -1.700597 | 0.287550 | 0.141410 | -0.006351 | -0.076215 | 1.990 |
259 | isolate_29.24HR.UT.BRep2 | isolate_29 | 24HR | UT | BRep2 | -0.563394 | 0.439543 | -1.676543 | -3.170102 | -3.343840 | ... | -0.813006 | -0.137299 | 0.273063 | 0.565321 | -0.821248 | 0.243620 | 0.015184 | 0.388667 | -0.222223 | 1.990 |
260 | isolate_29.6HR.DHA.BRep1 | isolate_29 | 6HR | DHA | BRep1 | -1.554723 | 0.730785 | -3.438687 | -5.447302 | -0.314840 | ... | -1.267192 | 0.186690 | -1.746578 | 1.520663 | -2.636001 | 0.754605 | -0.475425 | 0.248404 | -0.613732 | 1.990 |
261 | isolate_29.6HR.DHA.BRep2 | isolate_29 | 6HR | DHA | BRep2 | -0.636178 | 0.172809 | -2.696966 | -5.626919 | 0.109564 | ... | -0.883134 | 0.582260 | -0.300453 | 0.101378 | -0.786520 | 0.356458 | -0.151425 | 0.701458 | -0.536342 | 1.990 |
262 | isolate_29.6HR.UT.BRep1 | isolate_29 | 6HR | UT | BRep1 | -0.296083 | 0.423375 | -1.095481 | -6.484121 | -0.103989 | ... | -1.318517 | 1.397622 | -1.813682 | 1.143219 | -2.117421 | 0.096241 | -1.361130 | 1.634048 | 0.247918 | 1.990 |
263 | isolate_29.6HR.UT.BRep2 | isolate_29 | 6HR | UT | BRep2 | -1.813041 | 0.890928 | -2.479438 | -6.026245 | 0.827461 | ... | -0.377092 | -0.580702 | 0.131261 | -0.492634 | -0.935739 | 0.708170 | 0.247816 | 0.066255 | 0.315679 | 1.990 |
264 | isolate_30.24HR.DHA.BRep1 | isolate_30 | 24HR | DHA | BRep1 | -0.975989 | 1.164478 | -3.481287 | -6.037603 | 0.207426 | ... | -0.379305 | -0.228789 | 0.734703 | 0.248937 | -0.994332 | 0.875343 | 0.388887 | 0.076434 | 0.612181 | 1.363 |
265 | isolate_30.24HR.DHA.BRep2 | isolate_30 | 24HR | DHA | BRep2 | -0.471030 | 0.731458 | -2.543038 | -5.734210 | -2.323102 | ... | -1.619158 | 1.421581 | -1.248834 | 0.715166 | -2.319369 | 0.207114 | -1.059442 | 1.005176 | -1.289392 | 1.363 |
266 | isolate_30.24HR.UT.BRep1 | isolate_30 | 24HR | UT | BRep1 | 0.042976 | 0.558593 | -1.663835 | -2.616689 | -3.664556 | ... | -1.335481 | 1.120739 | -0.864432 | 1.163530 | -2.652660 | 0.296817 | -0.441023 | 0.831882 | -0.964654 | 1.363 |
267 | isolate_30.24HR.UT.BRep2 | isolate_30 | 24HR | UT | BRep2 | -0.837299 | 0.317409 | -2.262827 | -5.933248 | -1.007488 | ... | -1.200081 | 1.336456 | -1.286053 | 0.803294 | -2.141285 | -0.001567 | -1.079483 | 0.940947 | -1.248146 | 1.363 |
268 | isolate_30.6HR.DHA.BRep1 | isolate_30 | 6HR | DHA | BRep1 | -0.547496 | 1.278242 | -2.953364 | -4.714413 | -0.300387 | ... | -0.621518 | 1.165279 | 0.231229 | 0.605580 | -2.454108 | 1.003054 | -0.588209 | 0.057602 | -0.170505 | 1.363 |
269 | isolate_30.6HR.DHA.BRep2 | isolate_30 | 6HR | DHA | BRep2 | -1.068673 | 0.473819 | -3.041698 | -3.134905 | -0.066007 | ... | 0.042170 | 0.310388 | 0.486209 | 0.325325 | -0.454830 | 0.222535 | 0.447306 | 0.425515 | 0.315055 | 1.363 |
270 | isolate_30.6HR.UT.BRep1 | isolate_30 | 6HR | UT | BRep1 | -1.130075 | 0.845345 | -2.759546 | -5.439546 | 1.166847 | ... | -0.809479 | 0.309938 | -0.378315 | 0.090742 | -1.698631 | 0.772424 | -0.495931 | 0.319449 | -1.074312 | 1.363 |
271 | isolate_30.6HR.UT.BRep2 | isolate_30 | 6HR | UT | BRep2 | -1.593430 | 0.857925 | -3.017005 | -6.795783 | 0.233365 | ... | -0.523162 | 0.002438 | -0.258810 | 0.178586 | -1.783412 | 0.647358 | 0.556913 | 0.159119 | 0.983065 | 1.363 |
272 rows × 40 columns
数据处理
# Sample_Name is dropped; it is not used as a feature.
train_data.drop(['Sample_Name'], inplace=True, axis=1)
# Keep only replicates BRep1/BRep2 (the only ones present in the test set).
# .copy() detaches the filtered frame from the original so the column
# assignments below write to a real frame rather than a slice
# (avoids pandas' SettingWithCopyWarning).
train_data = train_data[train_data['BioRep'].isin(['BRep1', 'BRep2'])].copy()
# Integer codes for the three categorical columns.
encodings = {
    'Timepoint': {'24HR': 0, '6HR': 1},
    'Treatment': {'DHA': 0, 'UT': 1},
    'BioRep': {'BRep1': 0, 'BRep2': 1},
}
for column, codes in encodings.items():
    # Dict indexing raises KeyError on an unexpected label instead of
    # silently producing NaN the way Series.map would.
    train_data[column] = [codes[label] for label in train_data[column]]
train_data.head()
Isolate | Timepoint | Treatment | BioRep | MAL1.59416.59687....kr...can | PF3D7_0108200 | PF3D7_0115400 | PF3D7_0201000 | PF3D7_0201500 | PF3D7_0201600 | ... | PF3D7_1247200 | PF3D7_1321000 | PF3D7_1332500 | PF3D7_1337700 | PF3D7_1401000 | PF3D7_1431300 | PF3D7_1445900 | PF3D7_1452200 | PF3D7_1456900 | DHA_IC50 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | isolate_01 | 0 | 0 | 0 | -0.443936 | -0.100675 | -2.709971 | -3.213864 | -1.091086 | -1.111358 | ... | -0.985147 | 0.597583 | -1.155107 | 0.566342 | -3.282937 | -0.828819 | -0.988564 | 1.273673 | -1.168364 | 2.177 |
1 | isolate_01 | 0 | 0 | 1 | -1.186611 | 0.735564 | -2.472694 | -3.924790 | -0.645725 | -0.681182 | ... | -0.855834 | 0.246352 | -2.062505 | 0.728823 | -2.569909 | 0.395816 | -0.221650 | 0.342256 | 0.356901 | 2.177 |
2 | isolate_01 | 0 | 1 | 0 | -0.538092 | 1.220332 | -1.666902 | -5.039397 | 1.507344 | 1.117250 | ... | -0.255342 | -0.056962 | -0.741269 | -0.357539 | -1.195457 | 0.443488 | 0.627498 | 0.386491 | 1.687583 | 2.177 |
3 | isolate_01 | 0 | 1 | 1 | -0.828352 | 0.575415 | -2.744366 | -5.905546 | -0.568281 | -0.055037 | ... | -0.558252 | 0.201290 | -0.252612 | 1.202704 | -1.468765 | 0.700349 | 0.024436 | -0.014673 | 0.206773 | 2.177 |
4 | isolate_01 | 1 | 0 | 0 | -1.481881 | 0.283878 | -2.322445 | -5.374090 | 0.319846 | -0.059816 | ... | -0.422696 | -0.244172 | -0.013380 | 0.468331 | -1.030617 | 0.364250 | -0.580314 | 0.564132 | -0.431109 | 2.177 |
5 rows × 39 columns
随机森林搜索超参数
from sklearn.model_selection import train_test_split
# Hold out 30 rows for validation, stratified by isolate. The split is seeded
# (like the forest below) so that reruns give the same partition.
# NOTE(review): stratifying by Isolate puts rows of the same isolates in both
# train and valid, while the test set contains only unseen isolates; a
# group-wise split would give a more honest validation score - confirm intent.
train, valid = train_test_split(
    train_data,
    test_size=30,
    stratify=train_data['Isolate'],
    random_state=1,
)
train_dha = train.DHA_IC50
valid_dha = valid.DHA_IC50
# Isolate is an identifier and DHA_IC50 is the target; neither is a feature.
train_feature = train.drop(['DHA_IC50', 'Isolate'], axis=1)
valid_feature = valid.drop(['DHA_IC50', 'Isolate'], axis=1)
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=1, oob_score=True, n_estimators=200)
import numpy as np
# Exhaustive grid over four hyper-parameters (a large search space).
param_grid = {
    "max_features": list(np.arange(0.66, 0.97, 0.03)),
    "min_samples_split": list(np.arange(5, 50, 2)),
    "min_samples_leaf": list(np.arange(1, 8, 1)),
    "max_depth": list(np.arange(4, 10, 1)),
}
from sklearn.model_selection import GridSearchCV
# n_jobs=-1 spreads the fits over all cores; it does not change the result.
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(train_feature, train_dha)
print("Best parameters:{}".format(grid_search.best_params_))
Best parameters:{'max_depth': 9, 'max_features': 0.8100000000000002, 'min_samples_leaf': 6, 'min_samples_split': 19}
线性回归
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# MinMaxScaler learns each feature's minimum and maximum from the training
# rows only.
sc = MinMaxScaler()
sc.fit(train_feature)
# The validation rows are rescaled with the parameters learned on the
# training rows, so both sets share one scale.
ttrain_feature, vvalid_feature = (
    sc.transform(part) for part in (train_feature, valid_feature)
)
# Ordinary least-squares baseline on the rescaled features.
lr = LinearRegression()
lr.fit(ttrain_feature, train_dha)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
evalution(train_dha, lr.predict(ttrain_feature))
evalution(valid_dha, lr.predict(vvalid_feature))
相关系数为: 0.4272672215972652 置信度为: 99.99999999003602 %
相关系数为: 0.06696329254727475 置信度为: 27.48492169435901 %
对结果进行评估
from scipy.stats import spearmanr
def evalution(x, y):
    """Print the Spearman rank correlation between ``x`` and ``y``.

    Two values are printed on one line: the correlation coefficient and
    ``(1 - p) * 100``, where ``p`` is the p-value returned by
    ``scipy.stats.spearmanr``. The second value is labelled "confidence" in
    the output, but it is only a transform of the p-value, not a formal
    confidence level.

    Args:
        x: Sequence of reference values.
        y: Sequence of values to compare against ``x``; same length as ``x``.

    Returns:
        None. The result is only printed.
    """
    coef, p = spearmanr(x, y)
    print('相关系数为:', coef, " 置信度为:", (1-p)*100, "%")
evalution(train_dha, grid_search.predict(train_feature))
evalution(valid_dha, grid_search.predict(valid_feature))
相关系数为: 0.8908044428995714 置信度为: 100.0 %
相关系数为: 0.25428253615127916 置信度为: 82.4899021040629 %
# Hyper-parameters copied from the grid-search result printed earlier.
final_params = {
    'max_depth': 9,
    'max_features': 0.8100000000000002,
    'min_samples_leaf': 6,
    'min_samples_split': 19,
}
# Refit with a much larger ensemble than the one used during the search.
rf = RandomForestRegressor(
    random_state=1,
    oob_score=True,
    n_estimators=300000,
    **final_params
)
rf.fit(train_feature, train_dha)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
max_features=0.8100000000000002, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=6, min_samples_split=19,
min_weight_fraction_leaf=0.0, n_estimators=300000, n_jobs=None,
oob_score=True, random_state=1, verbose=0, warm_start=False)
# Spearman correlation of the refitted forest on the training and the
# held-out rows.
evalution(train_dha, rf.predict(train_feature))
evalution(valid_dha, rf.predict(valid_feature))
# get_params must be called; the bare attribute only shows the bound method.
rf.get_params()
相关系数为: 0.8959320670290216 置信度为: 100.0 %
相关系数为: 0.3027808676307008 置信度为: 89.61282920162074 %
<bound method BaseEstimator.get_params of RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
max_features=0.8100000000000002, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=6, min_samples_split=19,
min_weight_fraction_leaf=0.0, n_estimators=300000, n_jobs=None,
oob_score=True, random_state=1, verbose=0, warm_start=False)>
# Per-isolate summaries for training isolates 01..30: the mean measured
# DHA_IC50 plus the mean / median / max / min of the forest's row-level
# predictions for that isolate.
Target = []
Mean = []
Median = []
Min = []
Max = []
name = ""
for i in range(1, 31):
    # Isolate labels are zero-padded to two digits (isolate_01 ... isolate_30).
    name = "isolate_{:02d}".format(i)
    temp = train_data[train_data['Isolate'] == name]
    target = temp.DHA_IC50
    feature = temp.drop(['DHA_IC50', 'Isolate'], axis=1)
    # Predict once per isolate and derive every statistic from that single
    # result; the forest is large, so repeated predict calls are expensive.
    predicts = rf.predict(feature)
    Target.append(target.mean())
    Mean.append(predicts.mean())
    Median.append(np.median(predicts))
    Max.append(np.max(predicts))
    Min.append(np.min(predicts))
# sklearn.externals.joblib was deprecated and later removed from
# scikit-learn; prefer the standalone joblib package and fall back to the
# vendored copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Persist the fitted forest to the working directory.
joblib.dump(rf, 'rf.pkl')
['rf.pkl']
import itertools
# Expand each per-isolate summary to row level by repeating every value
# 8 times, so the lists line up with the row-level predictions below.
Target, Mean, Median, Max, Min = (
    list(itertools.chain.from_iterable([value] * 8 for value in series))
    for series in (Target, Mean, Median, Max, Min)
)
# Row-level predictions for the whole (filtered) training table.
predicts = rf.predict(train_data.drop(['DHA_IC50', 'Isolate'], axis=1))
# Compare the measured per-isolate mean against the raw predictions and
# against each per-isolate aggregate of the predictions.
for candidate in (predicts, Mean, Median, Max, Min):
    evalution(Target, candidate)
相关系数为: 0.8303088922694587 置信度为: 100.0 %
相关系数为: 0.9777530589543939 置信度为: 100.0 %
相关系数为: 0.974193548387097 置信度为: 100.0 %
相关系数为: 0.8327030033370413 置信度为: 100.0 %
相关系数为: 0.8909899888765297 置信度为: 100.0 %
获得计算结果
# Apply the training-time column selection to the test table, then drop the
# sample-name column.
test_data.drop(dropnames, inplace=True, axis=1)
test_data.drop(['Sample_Names'], inplace=True, axis=1)
# Same integer codes as used for the training table.
encodings = {
    'Timepoint': {'24HR': 0, '6HR': 1},
    'Treatment': {'DHA': 0, 'UT': 1},
    'BioRep': {'BRep1': 0, 'BRep2': 1},
}
for column, codes in encodings.items():
    # Dict indexing raises KeyError on an unexpected label instead of
    # silently producing NaN.
    test_data[column] = [codes[label] for label in test_data[column]]
# One prediction per test isolate (isolate_31 ... isolate_55): the mean of
# the forest's predictions over that isolate's rows.
Names = []
MMeans = []
for i in range(31, 56):
    name = "isolate_"+str(i)
    temp = test_data[test_data['Isolate'] == name]
    feature = temp.drop(['DHA_IC50', 'Isolate'], axis=1)
    predicts = rf.predict(feature)
    Names.append(name)
    MMeans.append(predicts.mean())
# Write the submission as a tab-separated file without the index column.
dataframe = pd.DataFrame({'Isolate': Names, 'Predicted_IC50': MMeans})
dataframe.to_csv("SubCh1_Submission.txt", sep='\t', index=False)
推荐阅读