python 实现描述性统计、频数分布图、正态分布检验、概率密度曲线拟合
程序员文章站
2024-01-05 17:00:58
...
- 描述性统计
- 频数分布图
- 正态分布检验
- 概率密度曲线拟合
#单个项目数据分析
#单个项目描述性统计
from scipy.stats import chi2 # 卡方分布
from scipy.stats import norm # 正态分布
from scipy.stats import t # t分布
from scipy.stats import f # F分布
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import chi2_contingency # 列联表分析
# rvs: Random Variates
# pdf: Probability Density Function 概率密度函数
# cdf: Cumulative Distribution Function 概率密度函数的积分函数
# sf: Survival Function (1-CDF)
# ppf: Percent Point Function (Inverse of CDF) 百分点函数,概率密度函数的积分值
# isf: Inverse Survival Function (Inverse of SF)
# stats: Return mean, variance, (Fisher’s) skew, or (Fisher’s) kurtosis
# moment: non-central moments of the distribution
# ppf以概率的形式,查询函数值-----------类似分布临界表
# example -------------------------------------------------------- 对连续数据进行正态拟合
plt.figure()
train = pd.read_csv("test.csv")
train_Age = train.dropna(subset=['p1'])
M_S = stats.norm.fit(train_Age['p1']) # 正态拟合的平均值与标准差
plt.hist(train_Age['p1'],bins=30, normed=1, facecolor='blue', alpha=0.5) # 原本的概率直方图
train_Age['p1'].plot(kind='kde',secondary_y=True) # 原本的概率密度分布图
normalDistribution = stats.norm(M_S[0], M_S[1]) # 绘制拟合的正态分布图
#x = np.linspace(normalDistribution.ppf(0.01), normalDistribution.ppf(0.99), 100)
x = np.linspace(normalDistribution.ppf(0.01), normalDistribution.ppf(0.99), 100)
plt.plot(x, normalDistribution.pdf(x), c='orange')
plt.xlabel('reqirement time')
plt.title('reqirement time on NormalDistribution', size=20)
plt.legend(['Origin', 'NormDistribution'])
from scipy import integrate
for n in range(0,400,1):
x=np.linspace(0,n,1000)
y=normalDistribution.pdf(x)
p=integrate.trapz(y, x)
if p>0.8:
print (n)
break
#多个项目批量输出分析结果
from scipy.stats import chi2 # 卡方分布
from scipy.stats import norm # 正态分布
from scipy.stats import t # t分布
from scipy.stats import f # F分布
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import chi2_contingency # 列联表分析
from scipy import integrate
from scipy.stats import kstest #检验正态分布
from statsmodels.stats.diagnostic import lillifors
# rvs: Random Variates
# pdf: Probability Density Function 概率密度函数
# cdf: Cumulative Distribution Function 概率密度函数的积分函数
# sf: Survival Function (1-CDF)
# ppf: Percent Point Function (Inverse of CDF) 百分点函数,概率密度函数的积分值
# isf: Inverse Survival Function (Inverse of SF)
# stats: Return mean, variance, (Fisher’s) skew, or (Fisher’s) kurtosis
# moment: non-central moments of the distribution
# ppf以概率的形式,查询函数值-----------类似分布临界表
train = pd.read_csv("test.csv")
#设置画布大小
fig = plt.figure(figsize=(16, 16))
#循环:依次计算p1~p46
#projects_index = ['p1', 'p2', 'p3', 'p4','p5', 'p6', 'p7', 'p8','p9', 'p10','p11', 'p12', 'p13', 'p14','p15', 'p16', 'p17', 'p18','p19', 'p20', 'p21', 'p22', 'p23', 'p24','p25', 'p26', 'p27', 'p28','p29', 'p30', 'p31', 'p32', 'p33', 'p34','p35', 'p36', 'p37', 'p38','p39', 'p40','p41', 'p42', 'p43', 'p44','p45', 'p46' ]
projects_index = ['p1', 'p2', 'p3', 'p4','p5', 'p6', 'p7', 'p8','p9', 'p10',
'p11', 'p12', 'p14', 'p15', 'p16', 'p17', 'p18','p19', 'p20',
'p21', 'p22', 'p23', 'p24','p25', 'p26', 'p27', 'p28','p29', 'p30',
'p31', 'p32', 'p33', 'p34','p35', 'p36', 'p38', 'p40',
'p41', 'p42', 'p43', 'p45', 'p46' ]
#projects_index = ['p1','p2','p3','p4','p5', 'p6', 'p7', 'p8']
count=0
#计算统计量
#创建一个空的Dataframe
Req_Leadtime = pd.DataFrame(columns=('项目名称','有效需求总数','平均值','中位数','众数','标准差','p值','需求交付周期阈值'))
for index in projects_index:
count+=1 #计数
#print(count)
print(index)
train_time = train.dropna(subset=[index])
# 正态拟合的平均值与标准差
M_S = stats.norm.fit(train_time[index])
# 0-画子图
#ax = fig.add_subplot(14, 3, count)
#1-依次作图保存
# 原本的概率直方图
plt.hist(train_time[index],bins=30, normed=1, facecolor='blue', alpha=0.5)
# 原本的概率密度分布图
train_time[index].plot(kind='kde',secondary_y=True)
# 绘制拟合的正态分布图
normalDistribution = stats.norm(M_S[0], M_S[1])
x = np.linspace(normalDistribution.ppf(0.01), normalDistribution.ppf(0.99), 100)
plt.plot(x, normalDistribution.pdf(x), c='orange')
#计算平均值
avg_leadtime = np.mean(train_time[index])
#计算中位数
med_leadtime = np.median(train_time[index])
#计算众数
mode_leadtime = stats.mode(train_time[index])[0][0]
#计算标准差
#std_leadtime = pd.std(train_time[index])
std_leadtime = np.std(train_time[index], ddof=1)
#检验正态分布,from scipy.stats import kstest,p为正态检验p值,>0.05
#rvs:待检验的数据
#cdf:检验方法,这里我们设置为‘norm’,即正态性检验
#alternative:默认为双尾检验,可以设置为‘less’或‘greater’作单尾检验
#q,p = kstest(train_time[index], 'norm',(avg_leadtime,std_leadtime))
#print(q,p)
#p = stats.shapiro(train_time[index])
if len(train_time[index])<8:
p = -1
else:
if len(train_time[index])<50:
q,p = stats.normaltest(train_time[index])
else:
if 50<=(len(train_time[index]))<=300:
#p = lillifors(train_time[index])
q,p = kstest(train_time[index], 'norm')
else:
q,p = kstest(train_time[index], 'norm')
print(p)
#计算80%需求交付周期
for n in range(0,400,1):
x=np.linspace(0,n,1000)
y=normalDistribution.pdf(x)
h=integrate.trapz(y, x)
if h>0.8:
print (n)
break
#计算统计量
#将计算结果逐行插入Req_Leadtime,注意变量要用[]括起来,同时ignore_index=True,否则会报错,ValueError: If using all scalar values, you must pass an index
valid_req_counts = len(train_time[index])
req_leadtime_avg = avg_leadtime
req_leadtime_med = med_leadtime
req_leadtime_mode = mode_leadtime
req_leadtime_std = std_leadtime
req_leadtime_p = p
req_leadtime_ref = n
Req_Leadtime = Req_Leadtime.append(pd.DataFrame({'项目名称':[index],
'有效需求总数':[valid_req_counts],
'平均值':[req_leadtime_avg],
'中位数':[req_leadtime_med],
'众数':[req_leadtime_mode],
'标准差':[req_leadtime_std],
'p值':[req_leadtime_p],
'需求交付周期阈值':[req_leadtime_ref]}),ignore_index=True)
print(req_leadtime_p,p)
plt.xlabel('Req leadtime')
plt.title("(%s) Req_leadtime on NormalDistribution, p = %10.3
f, Req_leadtime_req = %d" %(index,p,n),fontsize=20)
plt.legend(['Origin', 'NormDistribution'])
# 0-自动调整子图的间距
#plt.tight_layout()
#保存图片
plt.savefig("image/'" + index + "'.png")
#单独输出图片,需要清空画布
plt.clf()
#输出统计量
print(Req_Leadtime)
#输出到txt中
Req_Leadtime.to_excel('Req_Leadtime.xlsx', encoding='utf-8', index=True, header=True)
下一篇: Java中if条件语句的使用