统计分析进阶
程序员文章站
2024-03-22 09:11:16
...
一 数据特征分析
1.1 分布分析
研究数据的分布特征和分布类型,数据分为定量数据和定性数据两类。
做分布分析一般需要计算三个统计量:极差、频率分布、分组组距及组数。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the listings table from the local CSV file.
data = pd.read_csv(
    "C:/Users/lizheying/Desktop/ziliao/深圳罗湖二手房信息.csv",
    engine="python",
)

# Scatter the rows by longitude/latitude: marker size follows the unit-price
# column (scaled down by 500) and marker colour follows the total-price column.
marker_sizes = data["房屋单价"] / 500
marker_colors = data["参考总价"]
plt.scatter(
    data["经度"],
    data["纬度"],
    s=marker_sizes,
    c=marker_colors,
    cmap="Reds",
    alpha=0.8,
)
plt.grid(linestyle="dashed")

# Preview the first rows (shown as the notebook cell output).
data.head()
# Range (max - min) of the selected columns.
def d_range(df, *cols):
    """Return the range (maximum minus minimum) of each requested column.

    Args:
        df: Table-like object; ``df[col]`` must provide ``max()`` and ``min()``.
        *cols: Column labels to evaluate, in the order the results are wanted.

    Returns:
        A list holding one ``max - min`` value per label in ``cols``; the list
        is empty when no labels are passed.
    """
    return [df[col].max() - df[col].min() for col in cols]
# The two columns whose ranges are reported.
key1, key2 = '参考总价', '参考首付'

# Compute both ranges in one call and print them with two decimals each.
dr = d_range(data, key1, key2)
range_report = "%s极差为:%.2f\n%s极差为:%.2f" % (key1, dr[0], key2, dr[1])
print(range_report)
参考总价极差为:175.00
参考首付极差为:52.50
# Frequency distribution: quick 8-bin histogram of the key1 column.
data[key1].hist(bins=8)

# Frequency distribution by interval: cut the column into 10 left-closed bins,
# then count the rows per bin without re-sorting by count.
gcut = pd.cut(data[key1], bins=10, right=False)
gcut_count = gcut.value_counts(sort=False)

# Show the per-interval counts (notebook cell output).
gcut_count
[25.0, 42.5) 14
[42.5, 60.0) 17
[60.0, 77.5) 1
[77.5, 95.0) 2
[95.0, 112.5) 4
[112.5, 130.0) 2
[130.0, 147.5) 3
[147.5, 165.0) 4
[165.0, 182.5) 8
[182.5, 200.175) 20
Name: 参考总价, dtype: int64
# Store each row's interval in a new column named after the binned column.
interval_column = "%s分组区间" % key1
data[interval_column] = gcut

# Preview the table with the new column (notebook cell output).
data.head()
# Frequency of each interval: start from the per-interval counts.
r_zj = gcut_count.to_frame(name="频数")

# Relative and cumulative frequency.
total_count = r_zj["频数"].sum()
r_zj["频率"] = r_zj["频数"] / total_count
r_zj["累计频率"] = r_zj["频率"].cumsum()

# Percentage-formatted text versions of both frequency columns.
for source_col, text_col in {"频率": "频率%", "累计频率": "累计频率%"}.items():
    r_zj[text_col] = r_zj[source_col].map(lambda x: "%.2f%%" % (x * 100))

# Render the table with in-cell bars on the numeric frequency columns.
r_zj.style.bar(subset=["频率", "累计频率"])
# Bar chart of the per-interval frequencies.
r_zj["频率"].plot(kind="bar", figsize=(10, 6), grid=True, color="b", alpha=0.4)

# Label every bar with its raw count, placed slightly left of the bar centre
# and just above the bar top.
for position, (frequency, count) in enumerate(zip(r_zj["频率"], r_zj["频数"])):
    plt.text(position - 0.1, frequency + 0.001, "%d" % count)
# Frequency distribution of a qualitative (categorical) field: count how often
# each value of the "朝向" column occurs, most frequent first.
cx_g = data["朝向"].value_counts(sort=True)
r_cx = pd.DataFrame(cx_g)
r_cx.rename(columns={cx_g.name: "频数"}, inplace=True)

# Divide the count column itself rather than the whole frame. Assigning a
# DataFrame to a single column only works while the frame has exactly one
# column, and the interval table built earlier uses the column form as well.
r_cx["频率"] = r_cx["频数"] / r_cx["频数"].sum()
r_cx["累计频率"] = r_cx["频率"].cumsum()

# Percentage-formatted text versions of both frequency columns.
r_cx["频率%"] = r_cx["频率"].apply(lambda x: "%.2f%%" % (x * 100))
r_cx["累计频率%"] = r_cx["累计频率"].apply(lambda x: "%.2f%%" % (x * 100))

# Render the table with in-cell bars on the numeric frequency columns.
r_cx.style.bar(subset=["频率", "累计频率"])
# Frequency bar chart. Select the SimHei font so the Chinese labels can be
# drawn, and disable the Unicode minus glyph, before applying the ggplot style.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
plt.style.use('ggplot')

plt.figure(num=1, figsize=(12, 5))
r_cx["频率"].plot.bar(width=0.6, rot=0)
plt.title("朝向分布频率直方图")

# Pie chart of the same frequencies on a second, square figure.
plt.figure(num=2, figsize=(6, 6))
plt.pie(
    r_cx["频率"],
    labels=r_cx.index,
    autopct="%.2f%%",
    shadow=True,
)
plt.axis("equal")
1.2 对比分析
上一篇: Python+Excel+Word一秒制作百份合同
下一篇: 一款门罗币挖矿病毒木马分析