pandas文本处理
程序员文章站
2022-06-04 14:02:52
1 import pandas as pd 2 import numpy as np 3 4 s = pd.Series(['A', 'b', 'c', 'bbhello', '123', np.nan, 'hj']) 5 df = pd.DataFrame({'key1': list('abcde... ......
import pandas as pd
import numpy as np

# Tutorial script: vectorised string handling through the pandas `.str`
# accessor (case conversion, length, prefix/suffix tests, stripping,
# replacing and splitting).
#
# NOTE(review): the result comments below describe object-dtype string data,
# as in the article's published output. The exact repr (dtype line,
# missing-value marker) may differ between pandas versions -- confirm locally.

s = pd.Series(['A', 'b', 'c', 'bbhello', '123', np.nan, 'hj'])
df = pd.DataFrame({'key1': list('abcdef'),
                   'key2': ['hee', 'fv', 'w', 'hija', '123', np.nan]})
print(s)
print('-' * 8)
print(df)
print('-' * 8)

# String methods are called through `.str`; the accessor is available on a
# Series (e.g. a DataFrame column) and on an Index (e.g. the column labels).
# Missing values are skipped and stay missing in the result.
print(s.str.count('b'))
# -> 0, 1, 0, 2, 0, NaN, 0 (shown as floats because of the NaN)

print(df['key2'].str.upper())
# -> HEE, FV, W, HIJA, 123, NaN

# Upper-case every column label.
df.columns = df.columns.str.upper()
print(df)
# -> columns are now KEY1 and KEY2

# Common string methods -- lower, upper, len, startswith, endswith

print('小写,lower()', s.str.lower())
print('大写,upper()', s.str.upper())
print('长度,len()', s.str.len())
print('判断起始是否为b,startswith()', s.str.startswith('b'))
print('判断结束是否为"o",endswith()', s.str.endswith('o'))
# lower()         -> a, b, c, bbhello, 123, NaN, hj
# upper()         -> A, B, C, BBHELLO, 123, NaN, HJ
# len()           -> 1, 1, 1, 7, 3, NaN, 2
# startswith('b') -> False, True, False, True, False, NaN, False
# endswith('o')   -> False, False, False, True, False, NaN, False

# Common string methods -- strip, lstrip, rstrip

s2 = pd.Series([' jack', 'jill ', ' jesse '])
df2 = pd.DataFrame(np.random.randn(3, 2), columns=[' a ', ' b'], index=range(3))
print(s2)
print('-' * 8)
print(df2)  # values are random, so the printed numbers differ on every run
print('-' * 8)

print(s2.str.strip())   # whitespace removed on both sides
print('-' * 8)
print(s2.str.lstrip())  # leading whitespace removed only
print('-' * 8)
print(s2.str.rstrip())  # trailing whitespace removed only

# Clean up the column labels the same way.
df2.columns = df2.columns.str.strip()
print(df2)
# -> columns are now 'a' and 'b'

# Common string methods -- replace()
# `n` caps the number of replacements per label; `regex=False` makes the
# literal (non-regex) replacement explicit instead of relying on the default.
# NOTE(review): the published output shows '-b- b' for the second label, which
# suggests the original label contained more spaces than survive in this
# copy -- confirm against the original article.
df3 = pd.DataFrame(np.random.randn(3, 2), columns=[' a a', ' b b'], index=range(3))
df3.columns = df3.columns.str.replace(' ', '-', n=2, regex=False)
print(df3)
# -> columns are now '-a-a' and '-b-b'

# Common string methods -- split, rsplit
# The third element is a list rather than a string: `.str` methods cannot
# process it, so it comes back as NaN.
s4 = pd.Series(['a,b,c', '1,2,3', ['a,,,c'], np.nan])
print(s4)
print(s4.str.split(','))
# -> [a, b, c], [1, 2, 3], NaN, NaN

# Each split result is a list; its elements can be reached with `.str[i]`
# or the equivalent `.str.get(i)`.
print(s4.str.split(',').str[0])
print(s4.str.split(',').str.get(0))
# -> a, 1, NaN, NaN (both forms)

# `expand=True` spreads the pieces over the columns of a DataFrame;
# `n` limits the number of splits.
print(s4.str.split(','))
print('-' * 8)
print(s4.str.split(',', expand=True))
# -> 3 columns: (a, b, c), (1, 2, 3), then two all-NaN rows

print(s4.str.split(',', expand=True, n=1))
# -> 2 columns: (a, 'b,c'), (1, '2,3'), then two all-NaN rows

# rsplit works like split but in reverse, from the end of the string
# towards its start.
print(s4.str.split(',', expand=True, n=1))
print('-' * 8)
print(s4.str.rsplit(',', expand=True, n=1))
# split  -> (a, 'b,c'), (1, '2,3')
# rsplit -> ('a,b', c), ('1,2', 3)

df4 = pd.DataFrame({'key1': ['a,b,c', '1,2,3', [':,,, ']],
                    'key2': ['a-b-c', '1-2-3', [':-.- ']]})
print(df4)
print('-' * 8)
print(df4['key2'].str.split('-'))
# -> [a, b, c], [1, 2, 3], NaN (the list element cannot be split)

# Store each piece of the split in its own column, selected by position.
df4['k201'] = df4['key2'].str.split('-').str[0]
df4['k202'] = df4['key2'].str.split('-').str[1]
df4['k203'] = df4['key2'].str.split('-').str[2]
print(df4)
# -> k201/k202/k203 hold (a, b, c) and (1, 2, 3); the third row is NaN
下一篇: 冬季着凉易落枕 中医推荐针灸穴位治疗