pandas中的数值计算及统计基础
程序员文章站
2022-07-09 20:12:03
1 import pandas as pd 2 import numpy as np 3 4 df = pd.DataFrame({ 5 'key1': [4, 5, 3, np.nan, 2], 6 'key2': [1, 2, np.nan, 4, 5], 7 'key3': [1, 2, 3,... ......
import pandas as pd
import numpy as np

# Tutorial script: basic numeric computation and descriptive statistics
# with pandas.  Every "Expected output" comment block shows what the
# preceding print calls emit.

# Sample frame: two float columns containing NaN and one mixed int/str
# (object dtype) column.
df = pd.DataFrame({
    'key1': [4, 5, 3, np.nan, 2],
    'key2': [1, 2, np.nan, 4, 5],
    'key3': [1, 2, 3, 'j', 'k'],
}, index=['a', 'b', 'c', 'd', 'e'])
print(df)
print(df['key1'].dtype, df['key2'].dtype, df['key3'].dtype)
print('-------')
# Expected output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
# float64 float64 object
# -------

# Mean of every column: df.mean().
# Only numeric columns are aggregated and NaN is skipped by default.
# numeric_only=True is passed explicitly because the mixed int/str column
# 'key3' cannot be reduced; pandas >= 2.0 raises TypeError instead of
# silently dropping such columns.
print(df.mean(numeric_only=True))
# Expected output:
# key1    3.5
# key2    3.0
# dtype: float64

# Mean without skipping NaN.
# skipna defaults to True; with skipna=False any column containing NaN
# yields NaN.
m3 = df.mean(skipna=False, numeric_only=True)
print(m3)
# Expected output:
# key1   NaN
# key2   NaN
# dtype: float64

# Mean of a single column.
print('计算单一列的均值', df['key2'].mean())
# Expected output:
# 计算单一列的均值 3.0

df2 = pd.DataFrame({
    'key1': [1, 3, 5],
    'key2': [2, 4, 6],
    'key3': [3, 5, 7],
}, index=['a', 'b', 'c'])
# Row-wise mean (axis=1), stored in a new column.
df2['mean'] = df2.mean(axis=1)
print(df2)
# Expected output:
#    key1  key2  key3  mean
# a     1     2     3   2.0
# b     3     4     5   4.0
# c     5     6     7   6.0

# Number of non-NaN values per column: count().
print(df)
print('-' * 6)
print(df.count())
# Expected output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
# ------
# key1    4
# key2    4
# key3    5
# dtype: int64

# Descriptive statistics.  The DataFrame-wide reductions use
# numeric_only=True for the same reason as df.mean() above.
print(df)
print('-' * 6)
print('df的最小值', df.min(numeric_only=True))
print('df的最大值', df.max(numeric_only=True))
print('df的key2列的最大值', df['key2'].max())
print('统计df的分位数,参数q确定位置', df.quantile(q=0.75, numeric_only=True))
print('对df求和', df.sum(numeric_only=True))
print('求df的中位数,median(),50%分位数', df.median(numeric_only=True))
print('求df的标准差,std()', df.std(numeric_only=True))
print('求df的方差,var()', df.var(numeric_only=True))
print('求skew样本的偏度,skew()', df.skew(numeric_only=True))
print('求kurt样本的峰度,kurt()', df.kurt(numeric_only=True))
print('df累计求和,cumsum()', df['key2'].cumsum())
print('df累计求积,cumprod()', df['key2'].cumprod())
print('求df的累计最大值,cummax()', df['key2'].cummax())
print('求df的累计最小值,cummin()', df['key2'].cummin())
# Expected output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
# ------
# df的最小值 key1    2.0
# key2    1.0
# dtype: float64
# df的最大值 key1    5.0
# key2    5.0
# dtype: float64
# df的key2列的最大值 5.0
# 统计df的分位数,参数q确定位置 key1    4.25
# key2    4.25
# Name: 0.75, dtype: float64
# 对df求和 key1    14.0
# key2    12.0
# dtype: float64
# 求df的中位数,median(),50%分位数 key1    3.5
# key2    3.0
# dtype: float64
# 求df的标准差,std() key1    1.290994
# key2    1.825742
# dtype: float64
# 求df的方差,var() key1    1.666667
# key2    3.333333
# dtype: float64
# 求skew样本的偏度,skew() key1    0.0
# key2    0.0
# dtype: float64
# 求kurt样本的峰度,kurt() key1   -1.2
# key2   -3.3
# dtype: float64
# df累计求和,cumsum() a     1.0
# b     3.0
# c     NaN
# d     7.0
# e    12.0
# Name: key2, dtype: float64
# df累计求积,cumprod() a     1.0
# b     2.0
# c     NaN
# d     8.0
# e    40.0
# Name: key2, dtype: float64
# 求df的累计最大值,cummax() a    1.0
# b    2.0
# c    NaN
# d    4.0
# e    5.0
# Name: key2, dtype: float64
# 求df的累计最小值,cummin() a    1.0
# b    1.0
# c    NaN
# d    1.0
# e    1.0
# Name: key2, dtype: float64

# Unique values: unique() returns them in order of first appearance.
s = pd.Series(list('kjdhsakjdhjfh'))
sq = s.unique()
print(s)
print(sq)
print('sq的类型:', type(sq))
print('对sq进行重新排序:', pd.Series(sq).sort_values())
# Expected output:
# 0     k
# 1     j
# 2     d
# 3     h
# 4     s
# 5     a
# 6     k
# 7     j
# 8     d
# 9     h
# 10    j
# 11    f
# 12    h
# dtype: object
# ['k' 'j' 'd' 'h' 's' 'a' 'f']
# sq的类型: <class 'numpy.ndarray'>
# 对sq进行重新排序: 5    a
# 2    d
# 6    f
# 3    h
# 1    j
# 0    k
# 4    s
# dtype: object

# Count occurrences of each value in a single column.
# NOTE: the original article stated this works on one column only;
# pandas >= 1.1 also offers DataFrame.value_counts(), which counts
# unique rows rather than values.
print(df['key2'].value_counts())

# Element-wise membership test: is each cell contained in the given list?
print(df)
df_isin = df.isin([1, 3])
print(df_isin)
# Expected output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
#
#     key1   key2   key3
# a  False   True   True
# b  False  False  False
# c   True  False   True
# d  False  False  False
# e  False  False  False