pandas的合并、连接、去重、替换

程序员文章站 2022-06-24 11:46:26

1 import pandas as pd 2 import numpy as np 3 4 # merge合并，类似于Excel中的vlookup 5 6 df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 7 'A': ['A0', 'A1... ......

  1 import pandas as pd
  2 import numpy as np
  3 
  4 # merge合并 ，类似于excel中的vlookup
  5 
  6 df1 = pd.dataframe({'key': ['k0', 'k1', 'k2', 'k3'],
  7                     'a': ['a0', 'a1', 'a2', 'a3'],
  8                     'b': ['b0', 'b1', 'b2', 'b3']})
  9 df2 = pd.dataframe({'key': ['k0', 'k1', 'k2', 'k3'],
 10                     'c': ['c0', 'c1', 'c2', 'c3'],
 11                     'd': ['d0', 'd1', 'd2', 'd3']})
 12 df3 = pd.dataframe({'key1': ['k0', 'k0', 'k2', 'k3'],
 13                     'key2': ['k0', 'k1', 'k0', 'k1'],
 14                     'a': ['a0', 'a1', 'a2', 'a3'],
 15                     'b': ['b0', 'b1', 'b2', 'b3']})
 16 df4 = pd.dataframe({'key1': ['k0', 'k0', 'k2', 'k3'],
 17                     'key2': ['k0', 'k0', 'k0', 'k0'],
 18                     'c': ['c0', 'c1', 'c2', 'c3'],
 19                     'd': ['d0', 'd1', 'd2', 'd3']})
 20 print(pd.merge(df1,df2,on='key'))
 21 # 第一个dataframe为拼接后左边的
 22 # 第二个dataframe为拼接后右边的
 23 # on 为参考键
 24 '''
 25   key   a   b   c   d
 26 0  k0  a0  b0  c0  d0
 27 1  k1  a1  b1  c1  d1
 28 2  k2  a2  b2  c2  d2
 29 3  k3  a3  b3  c3  d3
 30 '''
 31 # 多个键连接
 32 print(pd.merge(df3, df4, on=['key1', 'key2']))
 33 # 当两个dataframe中的key1和key2都相同时，才会连，否则不连
 34 '''
 35   key1 key2   a   b   c   d
 36 0   k0   k0  a0  b0  c0  d0
 37 1   k0   k0  a0  b0  c1  d1
 38 2   k2   k0  a2  b2  c2  d2
 39 '''
 40 # 参数how  ， 合并方式
 41 # 默认，取交集
 42 print(pd.merge(df3, df4, on=['key1', 'key2'], how='inner'))
 43 print('-' * 8)
 44 '''
 45   key1 key2   a   b   c   d
 46 0   k0   k0  a0  b0  c0  d0
 47 1   k0   k0  a0  b0  c1  d1
 48 2   k2   k0  a2  b2  c2  d2
 49 --------
 50 '''
 51 # 取并集，outer，数据缺失范围nan
 52 print(pd.merge(df3, df4, on=['key1', 'key2'], how='outer'))
 53 print('-' * 8)
 54 '''
 55   key1 key2    a    b    c    d
 56 0   k0   k0   a0   b0   c0   d0
 57 1   k0   k0   a0   b0   c1   d1
 58 2   k0   k1   a1   b1  nan  nan
 59 3   k2   k0   a2   b2   c2   d2
 60 4   k3   k1   a3   b3  nan  nan
 61 5   k3   k0  nan  nan   c3   d3
 62 --------
 63 '''
 64 # 参照df3为参考合并，数据缺失范围nan
 65 print(pd.merge(df3, df4, on=['key1', 'key2'], how='left'))
 66 print('-' * 8)
 67 '''
 68   key1 key2   a   b    c    d
 69 0   k0   k0  a0  b0   c0   d0
 70 1   k0   k0  a0  b0   c1   d1
 71 2   k0   k1  a1  b1  nan  nan
 72 3   k2   k0  a2  b2   c2   d2
 73 4   k3   k1  a3  b3  nan  nan
 74 --------
 75 '''
 76 # 参照df4为参考合并，数据缺失范围nan
 77 print(pd.merge(df3, df4, on=['key1', 'key2'], how='right'))
 78 print('-' * 8)
 79 '''
 80   key1 key2    a    b   c   d
 81 0   k0   k0   a0   b0  c0  d0
 82 1   k0   k0   a0   b0  c1  d1
 83 2   k2   k0   a2   b2  c2  d2
 84 3   k3   k0  nan  nan  c3  d3
 85 --------
 86 '''
 87 # 参数left_on，right_on，left_index, right_index  ,当键不为一个列时，可以单独设置左键与右键
 88 df5 = pd.dataframe({'lkey': list('bbacaab'),
 89                     'data1': range(7)})
 90 df6 = pd.dataframe({'rkey': list('abd'),
 91                     'date2': range(3)})
 92 print(df5)
 93 print(df6)
 94 print(pd.merge(df5,df6,left_on='lkey',right_on='rkey'))
 95 '''
 96   lkey  data1
 97 0    b      0
 98 1    b      1
 99 2    a      2
100 3    c      3
101 4    a      4
102 5    a      5
103 6    b      6
104   rkey  date2
105 0    a      0
106 1    b      1
107 2    d      2
108   lkey  data1 rkey  date2
109 0    b      0    b      1
110 1    b      1    b      1
111 2    b      6    b      1
112 3    a      2    a      0
113 4    a      4    a      0
114 5    a      5    a      0
115 '''
116 
117 # concat() 连接，默认axis=0  行+行，当axis=1时，列+列  成为dataframe
118 s1 = pd.series([2, 3, 4])
119 s2 = pd.series([1, 2, 3])
120 print(pd.concat([s1, s2]))
121 '''
122 0    2
123 1    3
124 2    4
125 0    1
126 1    2
127 2    3
128 dtype: int64
129 '''
130 print(pd.concat([s1,s2],axis=1))
131 '''
132    0  1
133 0  2  1
134 1  3  2
135 2  4  3
136 '''
137 snew = pd.concat([s1, s2], axis=1)
138 snew.reset_index(inplace=true)
139 print(snew)
140 '''
141    index  0  1
142 0      0  2  1
143 1      1  3  2
144 2      2  4  3
145 '''
146 snew2 = pd.concat([s1, s2], axis=1)
147 snew2.reset_index(inplace=true, drop=true)
148 print(snew2)
149 '''
150    0  1
151 0  2  1
152 1  3  2
153 2  4  3
154 '''
155 
156 # 去重  .duplicated()
157 s3 = pd.series([1, 2, 2, 4, 4, 6, 7, 6, 87])
158 # 判断是否重复
159 print(s3.duplicated())
160 '''
161 0    false
162 1    false
163 2     true
164 3    false
165 4     true
166 5    false
167 6    false
168 7     true
169 8    false
170 dtype: bool
171 '''
172 # 取出重复的值
173 s4 = s3[s3.duplicated()]
174 print(s4)
175 # 取出唯一的元素
176 s5 = s3[s3.duplicated() == false]
177 print(s5)
178 '''
179 0     1
180 1     2
181 3     4
182 5     6
183 6     7
184 8    87
185 dtype: int64
186 '''
187 s5 = s3.drop_duplicates()
188 # 可以通过设置参数：inplace控制是否替换原先的值
189 print(s5)
190 '''
191 0     1
192 1     2
193 3     4
194 5     6
195 6     7
196 8    87
197 dtype: int64
198 '''
199 df7 = pd.dataframe({'key1':['a','a',3,4,3],
200                     'key2':['a','a','b','b',5]})
201 print(df7.duplicated())
202 # 按行检测，第二次出现时，返回true
203 '''
204 0     1
205 1     2
206 3     4
207 5     6
208 6     7
209 8    87
210 dtype: int64
211 '''
212 # 今查看key2列
213 print(df7['key2'].duplicated())
214 '''
215 0    false
216 1     true
217 2    false
218 3     true
219 4    false
220 name: key2, dtype: bool
221 '''
222 # 直接去重
223 print(df7.drop_duplicates())
224 '''
225   key1 key2
226 0    a    a
227 2    3    b
228 3    4    b
229 4    3    5
230 '''
231 print(df7['key2'].drop_duplicates())
232 '''
233 0    a
234 2    b
235 4    5
236 name: key2, dtype: object
237 '''
238 
239 # 替换  .replace()
240 s6 = pd.series(list('askjdghs'))
241 # 一次性替换一个值
242 # print(s6.replace('s','dsd'))
243 '''
244 0      a
245 1    dsd
246 2      k
247 3      j
248 4      d
249 5      g
250 6      h
251 7    dsd
252 dtype: object
253 '''
254 # 一次性替换多个值
255 print(s6.replace(['a','s'],np.nan))
256 '''
257 0    nan
258 1    nan
259 2      k
260 3      j
261 4      d
262 5      g
263 6      h
264 7    nan
265 dtype: object
266 '''
267 # 通过字典的形式替换值
268 print(s6.replace({'a':np.nan}))
269 '''
270 0    nan
271 1      s
272 2      k
273 3      j
274 4      d
275 5      g
276 6      h
277 7      s
278 dtype: object
279 
280 '''

上一篇：荐超强的入门文章！！！ JavaScript 速成计划第一篇（基础语法详解）!

下一篇： Java的八种基本数据类型及其大小：

pandas的合并、连接、去重、替换

MySQL数据表合并去重的简单实现方法

Pandas 同元素多列去重的实例

pandas表连接索引上的合并方法

pandas 实现将重复表格去重,并重新转换为表格的方法

在Pandas中DataFrame数据合并,连接(concat,merge,join)的实例

MySQL数据表合并去重的简单实现方法

JS实现的合并多个数组去重算法示例

Python将EXCEL表格中的多个sheet中的一列提取合并，并去重后统计记录的个数

js数组去重数组拼接替换数组中的指定值递归数组判断数组中是否存在指定值数组求和根据条件判数组值

Pandas 同元素多列去重的实例

pandas的合并、连接、去重、替换

MySQL数据表合并去重的简单实现方法

Pandas 同元素多列去重的实例

pandas表连接 索引上的合并方法

pandas 实现将重复表格去重,并重新转换为表格的方法

在Pandas中DataFrame数据合并,连接(concat,merge,join)的实例

MySQL数据表合并去重的简单实现方法

JS实现的合并多个数组去重算法示例

Python将EXCEL表格中的多个sheet中的一列提取合并，并去重后统计记录的个数

js数组去重 数组拼接 替换数组中的指定值 递归数组 判断数组中是否存在指定值 数组求和 根据条件判数组值

Pandas 同元素多列去重的实例

pandas表连接索引上的合并方法

js数组去重数组拼接替换数组中的指定值递归数组判断数组中是否存在指定值数组求和根据条件判数组值