Python编程--利用ENGS求最佳样本容量n--Bayes-5.18
(参考课本:《贝叶斯统计》-茆诗松-第一版-5.5最佳样本量的确定
例题5.18-5.19
题目:
某上考虑是否向一县办厂订购一种家用电器(以下简称电器)。该厂生产的电器有一等品和二等品两个等级,一等品与二等品的数量之比有1:1和2:1两种可能,其概率分别为0.45和0.55。
- 如果买到的是一等品,与一般市场价格相比较,每只可赚10元。
- 如果买到二等品,每只要亏15元。
假如该厂允许在一批电器中抽取若干只进行检验,根据抽样结果决定是否订购该批(900只)电器。但抽样总的费用为每只20元。这时商店必须考虑多少只最合算?
求上界
n
∗
≤
先验
E
V
P
I
−
C
f
C
v
n^* \le \frac{\text{先验}EVPI-C_f}{C_v}
n∗≤Cv先验EVPI−Cf
此题求得为41
求最佳样本量
- 若对每一个 n ( 1 , 2 , ⋯ , n ∗ ) n(1, 2, \cdots, n^*) n(1,2,⋯,n∗)值,都算得到 E N S I ( n ) ENSI(n) ENSI(n)和抽样成本 C ( n ) C(n) C(n)。
- 由1计算得到所有的 E N G S ( n ) ENGS(n) ENGS(n)。
- 找到2中最大的 E N G S ( n ) ENGS(n) ENGS(n)所对应的 n n n即为最佳的样本容量 n ∗ n^* n∗。
详细步骤
Step1:计算 θ \theta θ的后验分布
-
求 p ( x i ∣ θ ) p(x_i|\theta) p(xi∣θ); p ( x i ∣ θ ) = θ x i ( 1 − θ ) 1 − x i p(x_i|\theta)=\theta^{x_i}(1-\theta)^{1-x_i} p(xi∣θ)=θxi(1−θ)1−xi,计算出 x i x_i xi的分布。
-
求 m ( x i ) m(x_i) m(xi), m ( x i ) = ∑ j p ( x i ∣ θ j ) π ( θ j ) m(x_i)=\sum_j p(x_i|\theta_j)\pi(\theta_j) m(xi)=∑jp(xi∣θj)π(θj)。
-
求 p ( θ ∣ x i ) p(\theta|x_i) p(θ∣xi), p ( θ ∣ x i ) = p ( x i ∣ θ ) π ( θ ) m ( x i ) p(\theta|x_i)=\frac{p(x_i|\theta)\pi(\theta)}{m(x_i)} p(θ∣xi)=m(xi)p(xi∣θ)π(θ)
Step2:计算后验信息期望值(后验EVSI)
-
对每一个行动 a ( a 1 , a 2 ) a(a_1, a_2) a(a1,a2)计算其后验期望损失值 E θ ∣ x [ L ( θ , a ) ] E^{\theta | x}[L(\theta, a)] Eθ∣x[L(θ,a)]。
-
计算在抽样信息值 x 1 x_1 x1下的后验最优决策函数 δ ′ ( x ) . \delta'(x). δ′(x).
-
计算后验EVSI的期望值 E x { E θ ∣ x [ L ( θ , a ( x ) ) ] } E^x\{E^{\theta|x}[L(\theta,a(x))]\} Ex{Eθ∣x[L(θ,a(x))]}
Step3:计算抽样信息期望值EVSI
-
先验EVPI
-
后验EVPI
-
计算抽样信息期望值EVSI
E V S I = 先验 E V P I − 后验 E V P I EVSI = \text{先验}EVPI - \text{后验}EVPI EVSI=先验EVPI−后验EVPI
Step4: 计算抽样净益ENGS(n)
-
给出抽样成本 C ( n ) C(n) C(n).
-
由 E N G S ( n ) = E V S I ( n ) − C ( n ) ENGS(n)=EVSI(n) - C(n) ENGS(n)=EVSI(n)−C(n)计算得到抽样净益ENGS.
python编程实现
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
# ==================基本参数==================
PI_THETA = np.array([0.45, 0.55])
THETA = [[1/2, 1/2], [2/3, 1/3]]
THETA = np.array(THETA)
# 固定成本、一等品和二等品的收益亏损情况
C_F = 0
COST = np.array([10, -15])
# sample cost
C_V = 20
# 购进的电器数量
N = 900
# ==================一、计算上界==================
# 收益矩阵
def revenue_mat(COST=COST, THETA=THETA):
return COST.T @ THETA.T
# Q矩阵
def calc_Qmat(N=N):
R_mat = revenue_mat()
R_mat = np.concatenate((R_mat, [0, 0]), axis=0).reshape(2,2).T
Q_mat = N * R_mat
return Q_mat
# L矩阵--损失矩阵
def clac_Lmat(Q_mat:np.array):
col_max = np.max(Q_mat, axis=1, keepdims=True)
print(col_max)
L_mat = col_max - Q_mat
return L_mat
# 先验EVPI
def clac_afore_EVPI(L_mat, PI_THETA=PI_THETA):
afore_EVPI_mat = PI_THETA.T @ L_mat
return np.min(afore_EVPI_mat)
# 计算上界
def calc_bound(afore_EVPI, C_V=C_V):
return np.floor(afore_EVPI / C_V)
# ==================二、计算最佳样本量==================
class ENGS(object):
def __init__(self, n, L_mat, afore_EVPI, THETA=THETA, PI_THETA=PI_THETA, C_V=C_V):
self.n = n
self.L_mat = L_mat
self.afore_EVPI = afore_EVPI
self.THETA = THETA
self.PI_THETA = PI_THETA
self.C_V = C_V
# 计算一个分布列
def calc_pmf(self, p):
p_li = []
for i in range(self.n+1):
p_li.append(stats.binom.pmf(i, self.n, p))
return p_li
# x的条件分布|\theta
def calc_xi_condition_theta(self):
a_li = []
for pi in self.THETA:
a_li.append(self.calc_pmf(pi[1]))
self.xi_condition_theta = np.array(a_li).T
return np.array(a_li).T
# x_i 每个取值的边缘分布
def calc_mx(self):
xi_condition_theta = self.calc_xi_condition_theta()
mx = self.xi_condition_theta @ self.PI_THETA
self.mx = mx
return mx
# \theta的后验分布
def calc_theta_posterior(self):
xi_condition_theta = self.calc_xi_condition_theta()
part_up = xi_condition_theta @ np.diag(self.PI_THETA)
part_down = self.calc_mx()
theta_posterior = part_up / part_down.reshape(self.n+1,1)
self.theta_posterior = theta_posterior
return theta_posterior
# 后验期望损失
def calc_post_expect_loss(self):
theta_posterior = self.calc_theta_posterior()
post_expect_loss = theta_posterior @ self.L_mat
self.post_expect_loss = post_expect_loss
return post_expect_loss
# 后验EVPI期望
def calc_post_EVPI_expect(self):
cols = ['theta_1', 'theta_2']
post_expect_loss = self.calc_post_expect_loss()
df = pd.DataFrame(data=post_expect_loss)
df['min_loss_expect'] = np.min(post_expect_loss, axis=1)
df['delta_x'] = np.argmin(post_expect_loss, axis=1)
mx = self.calc_mx()
dada= df['min_loss_expect'] @ mx
return dada
# 抽样信息净益
def calc_EVSI(self, afore_EVPI, post_EVPI):
self.EVSI = afore_EVPI - post_EVPI
return afore_EVPI - post_EVPI
# 抽样净益
def calc_ENGS(self):
# return EVSI - C_V*n
post_expect_loss = self.calc_post_expect_loss()
post_EVPI_expect = self.calc_post_EVPI_expect()
EVSI = self.calc_EVSI(self.afore_EVPI, post_EVPI_expect)
self.ENGS_value = EVSI - C_V*self.n
Q_mat = calc_Qmat()
L_mat = clac_Lmat(Q_mat)
afore_EVPI = clac_afore_EVPI(L_mat)
n_star = calc_bound(afore_EVPI)
print(Q_mat)
print(L_mat)
print(afore_EVPI)
print(n_star)
# ==================二、计算最佳样本量--输出==================
def find_best_n(n_star=n_star):
lala = []
for i in range(1, int(n_star)+1):
sample = ENGS(i, L_mat, afore_EVPI)
sample.calc_ENGS()
lala.append([i, sample.ENGS_value])
df = pd.DataFrame(lala, columns=['n', 'ENGS'])
best_n, best_ENGS = df.sort_values(by='ENGS').iloc[-1]
best_sample = ENGS(int(best_n), L_mat, afore_EVPI)
best_sample.calc_ENGS()
print(best_sample.mx)
print(best_sample.theta_posterior)
print(best_sample.post_expect_loss)
return best_n, best_ENGS
best_n, best_ENGS = find_best_n()
print('best_n, best_ENGS:\n',best_n, best_ENGS)
# ==================三、绘图==================
# 绘图数据
data = []
for i in range(1, int(n_star)+1):
sample = ENGS(i, L_mat, afore_EVPI)
sample.calc_ENGS()
# print('EVSI', sample.EVSI)
data.append([i, sample.ENGS_value, sample.afore_EVPI, sample.EVSI])
# print(lala)
df = pd.DataFrame(lala, columns=['n', 'ENGS', 'EVPI', 'EVSI'])
x_data = list(range(1, int(n_star)+1))
y_data = df['ENGS']
y_data2 = [n*C_V for n in x_data]
# print(y_data2)
y_data3 = df['EVPI']
y_data4 = df['EVSI']
# 设置字体、正负号
plt.rcParams['font.family'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# 画图
plt.plot(x_data,y_data,label='ENGS',color='r',linewidth=1.0,linestyle='-')
plt.plot(x_data,y_data2,label='C',color='k',linewidth=2.0,linestyle='-')
plt.plot(x_data,y_data3,label='EVPI',color='y',linewidth=1.0,linestyle='-')
plt.plot(x_data,y_data4,label='EVSI',color='b',linewidth=1.0,linestyle='-')
plt.legend(loc="upper right")
plt.xlabel('n')
plt.ylabel('y轴')
plt.plot(best_n, best_ENGS,'o')
plt.annotate('(7.0, 101.38736175411486)', xy=(best_n, best_ENGS), xytext=(best_n+2, best_ENGS-250),
arrowprops=dict(facecolor='black', shrink=0.05)
)
plt.axhline(y=best_ENGS,ls=":",c="k")#添加水平直线
plt.title("抽样净益期望值曲线") #设置标题及字体
plt.show()
plt.savefig('ENGS_line.jpg')
结果
# Q_mat
[[-2250. 0.]
[ 1500. 0.]]
# L_mat
[[2250. 0.]
[ 0. 1500.]]
# afore_EVPI 先验EVPI
824.9999999999998
# n_star 样本量的上界
41.0
# best_mx X的边缘分布
[0.03570584 0.13727513 0.24282675 0.26387907 0.19346297 0.09495295
0.02813018 0.00376711]
# $\theta|x$ theta的后验分布
[[0.09846078 0.90153922]
[0.17927046 0.82072954]
[0.30403621 0.69596379]
[0.46630025 0.53369975]
[0.63602288 0.36397712]
[0.77752321 0.22247679]
[0.87483888 0.12516112]
[0.93324167 0.06675833]]
# post_expect_loss 后验期望损失
[[ 221.5367646 1352.3088236 ]
[ 403.35853181 1231.09431213]
[ 684.08146558 1043.94568962]
[1049.17557035 800.54961977]
[1431.0514751 545.96568326]
[1749.42721706 333.71518862]
[1968.38748389 187.74167741]
[2099.79375089 100.13749941]]
# 最佳样本量、ENGS
best_n, best_ENGS:
7.0 101.38736175411486
上一篇: R语言由浅入深:一、初始R语言
下一篇: 代理服务器引起的用户信息错乱