pandas的合并、连接、去重、替换
程序员文章站
2022-06-24 11:46:26
1 import pandas as pd 2 import numpy as np 3 4 # merge合并 ,类似于Excel中的vlookup 5 6 df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 7 'A': ['A0', 'A1... ......
1 import pandas as pd 2 import numpy as np 3 4 # merge合并 ,类似于excel中的vlookup 5 6 df1 = pd.dataframe({'key': ['k0', 'k1', 'k2', 'k3'], 7 'a': ['a0', 'a1', 'a2', 'a3'], 8 'b': ['b0', 'b1', 'b2', 'b3']}) 9 df2 = pd.dataframe({'key': ['k0', 'k1', 'k2', 'k3'], 10 'c': ['c0', 'c1', 'c2', 'c3'], 11 'd': ['d0', 'd1', 'd2', 'd3']}) 12 df3 = pd.dataframe({'key1': ['k0', 'k0', 'k2', 'k3'], 13 'key2': ['k0', 'k1', 'k0', 'k1'], 14 'a': ['a0', 'a1', 'a2', 'a3'], 15 'b': ['b0', 'b1', 'b2', 'b3']}) 16 df4 = pd.dataframe({'key1': ['k0', 'k0', 'k2', 'k3'], 17 'key2': ['k0', 'k0', 'k0', 'k0'], 18 'c': ['c0', 'c1', 'c2', 'c3'], 19 'd': ['d0', 'd1', 'd2', 'd3']}) 20 print(pd.merge(df1,df2,on='key')) 21 # 第一个dataframe为拼接后左边的 22 # 第二个dataframe为拼接后右边的 23 # on 为参考键 24 ''' 25 key a b c d 26 0 k0 a0 b0 c0 d0 27 1 k1 a1 b1 c1 d1 28 2 k2 a2 b2 c2 d2 29 3 k3 a3 b3 c3 d3 30 ''' 31 # 多个键连接 32 print(pd.merge(df3, df4, on=['key1', 'key2'])) 33 # 当两个dataframe中的key1和key2都相同时,才会连,否则不连 34 ''' 35 key1 key2 a b c d 36 0 k0 k0 a0 b0 c0 d0 37 1 k0 k0 a0 b0 c1 d1 38 2 k2 k0 a2 b2 c2 d2 39 ''' 40 # 参数how , 合并方式 41 # 默认,取交集 42 print(pd.merge(df3, df4, on=['key1', 'key2'], how='inner')) 43 print('-' * 8) 44 ''' 45 key1 key2 a b c d 46 0 k0 k0 a0 b0 c0 d0 47 1 k0 k0 a0 b0 c1 d1 48 2 k2 k0 a2 b2 c2 d2 49 -------- 50 ''' 51 # 取并集,outer,数据缺失范围nan 52 print(pd.merge(df3, df4, on=['key1', 'key2'], how='outer')) 53 print('-' * 8) 54 ''' 55 key1 key2 a b c d 56 0 k0 k0 a0 b0 c0 d0 57 1 k0 k0 a0 b0 c1 d1 58 2 k0 k1 a1 b1 nan nan 59 3 k2 k0 a2 b2 c2 d2 60 4 k3 k1 a3 b3 nan nan 61 5 k3 k0 nan nan c3 d3 62 -------- 63 ''' 64 # 参照df3为参考合并,数据缺失范围nan 65 print(pd.merge(df3, df4, on=['key1', 'key2'], how='left')) 66 print('-' * 8) 67 ''' 68 key1 key2 a b c d 69 0 k0 k0 a0 b0 c0 d0 70 1 k0 k0 a0 b0 c1 d1 71 2 k0 k1 a1 b1 nan nan 72 3 k2 k0 a2 b2 c2 d2 73 4 k3 k1 a3 b3 nan nan 74 -------- 75 ''' 76 # 参照df4为参考合并,数据缺失范围nan 77 print(pd.merge(df3, df4, on=['key1', 'key2'], how='right')) 78 print('-' * 8) 79 ''' 80 key1 key2 a b c d 81 0 k0 k0 a0 b0 c0 d0 82 1 k0 k0 a0 b0 c1 d1 83 2 k2 k0 a2 b2 c2 d2 84 3 k3 k0 nan nan c3 d3 85 -------- 86 ''' 87 # 参数left_on,right_on,left_index, right_index ,当键不为一个列时,可以单独设置左键与右键 88 df5 = pd.dataframe({'lkey': list('bbacaab'), 89 'data1': range(7)}) 90 df6 = pd.dataframe({'rkey': list('abd'), 91 'date2': range(3)}) 92 print(df5) 93 print(df6) 94 print(pd.merge(df5,df6,left_on='lkey',right_on='rkey')) 95 ''' 96 lkey data1 97 0 b 0 98 1 b 1 99 2 a 2 100 3 c 3 101 4 a 4 102 5 a 5 103 6 b 6 104 rkey date2 105 0 a 0 106 1 b 1 107 2 d 2 108 lkey data1 rkey date2 109 0 b 0 b 1 110 1 b 1 b 1 111 2 b 6 b 1 112 3 a 2 a 0 113 4 a 4 a 0 114 5 a 5 a 0 115 ''' 116 117 # concat() 连接,默认axis=0 行+行,当axis=1时,列+列 成为dataframe 118 s1 = pd.series([2, 3, 4]) 119 s2 = pd.series([1, 2, 3]) 120 print(pd.concat([s1, s2])) 121 ''' 122 0 2 123 1 3 124 2 4 125 0 1 126 1 2 127 2 3 128 dtype: int64 129 ''' 130 print(pd.concat([s1,s2],axis=1)) 131 ''' 132 0 1 133 0 2 1 134 1 3 2 135 2 4 3 136 ''' 137 snew = pd.concat([s1, s2], axis=1) 138 snew.reset_index(inplace=true) 139 print(snew) 140 ''' 141 index 0 1 142 0 0 2 1 143 1 1 3 2 144 2 2 4 3 145 ''' 146 snew2 = pd.concat([s1, s2], axis=1) 147 snew2.reset_index(inplace=true, drop=true) 148 print(snew2) 149 ''' 150 0 1 151 0 2 1 152 1 3 2 153 2 4 3 154 ''' 155 156 # 去重 .duplicated() 157 s3 = pd.series([1, 2, 2, 4, 4, 6, 7, 6, 87]) 158 # 判断是否重复 159 print(s3.duplicated()) 160 ''' 161 0 false 162 1 false 163 2 true 164 3 false 165 4 true 166 5 false 167 6 false 168 7 true 169 8 false 170 dtype: bool 171 ''' 172 # 取出重复的值 173 s4 = s3[s3.duplicated()] 174 print(s4) 175 # 取出唯一的元素 176 s5 = s3[s3.duplicated() == false] 177 print(s5) 178 ''' 179 0 1 180 1 2 181 3 4 182 5 6 183 6 7 184 8 87 185 dtype: int64 186 ''' 187 s5 = s3.drop_duplicates() 188 # 可以通过设置参数:inplace控制是否替换原先的值 189 print(s5) 190 ''' 191 0 1 192 1 2 193 3 4 194 5 6 195 6 7 196 8 87 197 dtype: int64 198 ''' 199 df7 = pd.dataframe({'key1':['a','a',3,4,3], 200 'key2':['a','a','b','b',5]}) 201 print(df7.duplicated()) 202 # 按行检测,第二次出现时,返回true 203 ''' 204 0 1 205 1 2 206 3 4 207 5 6 208 6 7 209 8 87 210 dtype: int64 211 ''' 212 # 今查看key2列 213 print(df7['key2'].duplicated()) 214 ''' 215 0 false 216 1 true 217 2 false 218 3 true 219 4 false 220 name: key2, dtype: bool 221 ''' 222 # 直接去重 223 print(df7.drop_duplicates()) 224 ''' 225 key1 key2 226 0 a a 227 2 3 b 228 3 4 b 229 4 3 5 230 ''' 231 print(df7['key2'].drop_duplicates()) 232 ''' 233 0 a 234 2 b 235 4 5 236 name: key2, dtype: object 237 ''' 238 239 # 替换 .replace() 240 s6 = pd.series(list('askjdghs')) 241 # 一次性替换一个值 242 # print(s6.replace('s','dsd')) 243 ''' 244 0 a 245 1 dsd 246 2 k 247 3 j 248 4 d 249 5 g 250 6 h 251 7 dsd 252 dtype: object 253 ''' 254 # 一次性替换多个值 255 print(s6.replace(['a','s'],np.nan)) 256 ''' 257 0 nan 258 1 nan 259 2 k 260 3 j 261 4 d 262 5 g 263 6 h 264 7 nan 265 dtype: object 266 ''' 267 # 通过字典的形式替换值 268 print(s6.replace({'a':np.nan})) 269 ''' 270 0 nan 271 1 s 272 2 k 273 3 j 274 4 d 275 5 g 276 6 h 277 7 s 278 dtype: object 279 280 '''