欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

R语言-ISLR包的weekly数据集进行logistic回归

程序员文章站 2022-03-23 08:31:29
...

glm()函数用与拟合广义线性模型,其中参数family:每一种响应分布(指数分布族)允许各种关联函数将均值和线性预测器关联起来
常用family:
binomal(link=‘logit’) ----响应变量服从二项分布,连接函数为logit,即logistic回归
binomal(link=‘probit’) ----响应变量服从二项分布,连接函数为probit
poisson(link=‘identity’) ----响应变量服从泊松分布,即泊松回归
1.加载相关包并查看数据

library(ISLR)
library(broom)
library(tidyverse)
library(ggplot2)
library(MASS)
library(class)
library(caret)
library(e1071)
glimpse(Weekly)
## Observations: 1,089
## Variables: 9
## $ Year      <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 199...
## $ Lag1      <dbl> 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807...
## $ Lag2      <dbl> 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372...
## $ Lag3      <dbl> -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178...
## $ Lag4      <dbl> -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.71...
## $ Lag5      <dbl> -3.484, -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.5...
## $ Volume    <dbl> 0.1549760, 0.1485740, 0.1598375, 0.1616300, 0.1537280, 0....
## $ Today     <dbl> -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0.041...
## $ Direction <fct> Down, Down, Up, Up, Up, Down, Up, Up, Up, Down, Down, Up,...

2.对weekly数据进行数值和图像描述统计

summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
#cor计算相关系数
cor(Weekly[,-9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
#绘制散点矩阵图
pairs(Weekly[,-9])

R语言-ISLR包的weekly数据集进行logistic回归
从相关系数矩阵和散点矩阵图可以看出:滞后时间变量Lag1~Lag2之间没有显著性关系,但交易量Volume随时间不断有明显的增加

下面对Volume随时间变化的趋势绘图

#判断Weekly中Lag1列往下移一行的数据与TOday列是否对应相等,从而判断数据是否按周增加
#lead(1:5,n=2L)运行结果3  4  5 NA NA;
#lag(1:5,n=2L)运行结果NA NA  1  2  3
Weekly %>%
  filter(lead(Lag1) != Today) %>%
  nrow()
## [1] 0
#按年分类并找出每年第一周的周序号
Weekly$Week<-1:nrow(Weekly)
Year_breaks<-Weekly%>%group_by(Year)%>%summarise(Week=min(Week))
#按周绘制交易量随时间的变化折线图
ggplot(Weekly,aes(x=Week,y=Volume))+
  geom_line()+   #绘制折线图
  geom_smooth()+  #添加平滑趋势曲线
  theme_light() +  #设置主题
  scale_x_continuous(breaks = Year_breaks$Week,minor_breaks = NULL,labels = Year_breaks$Year)+
  #如何按自己的意愿设置X轴的标签
  labs(title = "Average Daily share trades vs Time", 
       x = "Time", 
       y = "volume")

R语言-ISLR包的weekly数据集进行logistic回归
下面绘制Direction随时间变化图,只有(2000、2001、2002、2008)这四年50%以上的周没看到正回报

#绘制堆积直方图
ggplot(Weekly,aes(x=Year,fill=Direction))+ 
  geom_bar(position = "fill")+
  geom_hline(yintercept = 0.5,col="black")+ #绘制y=0.5的水平参考线
  scale_x_continuous(breaks =seq(1990,2010),minor_breaks = NULL,labels = Year_breaks$Year )+
  scale_y_continuous(labels = scales::percent_format())+ #把y轴数值设为百分比制
  theme(axis.title.y =element_blank(),legend.position = "bottom")+  #取消y轴的标题
  ggtitle("% of Up/Down Weeks vs Time")

R语言-ISLR包的weekly数据集进行logistic回归

#分别计算出现Down和Up的概率
Week.probs<-prop.table(table(Weekly$Direction))
Week.probs
##      Down        Up 
## 0.4444444 0.5555556
#绘制随时间变化的周波动
ggplot(Weekly, aes(x = Week, y = Today/100 )) +  #Today/100进行百分比化处理
  geom_line()+
  scale_x_continuous(breaks = Year_breaks$Week,minor_breaks = NULL,labels = Year_breaks$Year)+
  scale_y_continuous(labels = scales::percent_format(),breaks = seq(-0.2,0.2,0.05))+
  geom_hline(yintercept = 0,col="grey55")+ #绘制基准线
  theme_light()+
  labs(title = "Weekly Percentage Return vs Time",
       x="Time",
       y="Percentage Return")

R语言-ISLR包的weekly数据集进行logistic回归
3.用整个数据集建立logistic回归

#进行logistic回归拟合
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data = Weekly,family = binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
tidy(glm.fit) #将模型输出结果转化为数据框
## # A tibble: 7 x 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   0.267     0.0859     3.11  0.00190
## 2 Lag1         -0.0413    0.0264    -1.56  0.118  
## 3 Lag2          0.0584    0.0269     2.18  0.0296 
## 4 Lag3         -0.0161    0.0267    -0.602 0.547  
## 5 Lag4         -0.0278    0.0265    -1.05  0.294  
## 6 Lag5         -0.0145    0.0264    -0.549 0.583  
## 7 Volume       -0.0227    0.0369    -0.616 0.538

结果输出的z-statistic(z-value)的计算方法跟线性回归的T检验一样,z值的绝对值较大表明拒绝原假设H0:βj=0
由各个预测变量的P值可以看出,Lag2是显著性变量

4.计算混淆矩阵和整体预测率

#预测函数predict
#参数response告诉R只用输出概率P(Y=1|X)
#如果不给predict提供预测预测数据集,它会自动拟合logistic回归的训练数据的概率
glm.probs=predict(glm.fit,type = "response") 
glm.probs[1:5] #这些值对应市场是上涨而不是下跌的概率
##         1         2         3         4         5 
## 0.6086249 0.6010314 0.5875699 0.4816416 0.6169013
#contrasts函数创建了一个哑变量 
contrasts(Weekly$Direction) 
##      Up
## Down  0
## Up    1
#将用logistic回归的训练数据的预测结果转化为变化方向
glm.pred=rep("Down",1089)
glm.pred[glm.probs>.5]='Up'
#计算预测结果与原来结果的混淆矩阵,从而计算预测一致的概率
attach(Weekly)
table(glm.pred,Direction)
##         Direction
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557
mean(glm.pred==Direction)
## [1] 0.5610652

用caret::confusionMatrix计算混淆矩阵

#用caret::confusionMatrix计算混淆矩阵
attach(Weekly)
## The following objects are masked from Weekly (pos = 3):
## 
##     Direction, Lag1, Lag2, Lag3, Lag4, Lag5, Today, Volume, Week, Year
Predicted<-factor(ifelse(predict(glm.fit,type = "response")<.5,"Down","Up"))
confusionMatrix(Predicted,Direction,positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down  Up
##       Down   54  48
##       Up    430 557
##                                          
##                Accuracy : 0.5611         
##                  95% CI : (0.531, 0.5908)
##     No Information Rate : 0.5556         
##     P-Value [Acc > NIR] : 0.369          
##                                          
##                   Kappa : 0.035          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9207         
##             Specificity : 0.1116         
##          Pos Pred Value : 0.5643         
##          Neg Pred Value : 0.5294         
##              Prevalence : 0.5556         
##          Detection Rate : 0.5115         
##    Detection Prevalence : 0.9063         
##       Balanced Accuracy : 0.5161         
##                                          
##        'Positive' Class : Up   
prop.table(table(Predicted))
## Predicted
##       Down         Up 
## 0.09366391 0.90633609          

5.用2009年之前的训练数据拟合logistic回归模型,其中只把Lag2作为预测变量,计算混淆矩阵的和测试集(2009和2010年)中总体预测准确率

#用1990-2008年的训练数据来拟合logistic回归模型,只把lag2作为预测变量,计算2009-2010的预测准确率
attach(Weekly)
train=(Year<2009) #生成一个对应的布尔向量
#布尔向量可用于获取某个矩阵的行或子列
Weekly.2009=Weekly[!train,] #测试集数据
dim(Weekly.2009)  #查看该数据的维度
## [1] 104  10
glm.fit1<-glm(Direction~Lag2,data = Weekly,family = binomial,subset = train)
# or glm.fit1<-glm(Direction~Lag2,data = Weekly[train,],family = binomial)
summary(glm.fit1)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly, 
##     subset = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.536  -1.264   1.021   1.091   1.368  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4
glm.probs1<-predict(glm.fit1,Weekly.2009,type = "response")
glm.pred1<-rep("Down",104)
glm.pred1[glm.probs1>.5]="Up"
attach(Weekly.2009)
table(glm.pred1,Direction)
##          Direction
## glm.pred1 Down Up
##      Down    9  5
##      Up     34 56
mean(glm.pred1==Direction)
## [1] 0.625

用caret::confusionMatrix计算混淆矩阵

predicted1<-factor(ifelse(predict(glm.fit1,Weekly.2009,type = "response")<.5,"Down","Up"))
confusionMatrix(data = predicted1,reference=Weekly.2009$Direction,positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    9  5
##       Up     34 56
##                                          
##                Accuracy : 0.625          
##                  95% CI : (0.5247, 0.718)
##     No Information Rate : 0.5865         
##     P-Value [Acc > NIR] : 0.2439         
##                                          
##                   Kappa : 0.1414         
##                                          
##  Mcnemar's Test P-Value : 7.34e-06       
##                                          
##             Sensitivity : 0.9180         
##             Specificity : 0.2093         
##          Pos Pred Value : 0.6222         
##          Neg Pred Value : 0.6429         
##              Prevalence : 0.5865         
##          Detection Rate : 0.5385         
##    Detection Prevalence : 0.8654         
##       Balanced Accuracy : 0.5637         
##                                          
##        'Positive' Class : Up             
## 

No Information Rate:0.5865,即表明测试数据的58.65%为最大分类(Up),因此这是分类器的一个基准线
Accuracy:0.625>0.5865
我们为单面测试提供了p值,以查看准确性是否优于“无信息率”。 P值[Acc> NIR]:0.2439> 0.05⟹没有明显的证据表明我们的分类器优于基线策略。

完整代码

library(ISLR)
library(broom)
library(tidyverse)
library(ggplot2)
library(MASS)
library(class)
library(caret)
library(e1071)
glimpse(Weekly)
summary(Weekly)
#cor计算相关系数
cor(Weekly[,-9])
#绘制散点矩阵图
pairs(Weekly[,-9])
#判断Weekly中Lag1列往下移一行的数据与TOday列是否对应相等,从而判断数据是否按周增加
#lead(1:5,n=2L)运行结果3  4  5 NA NA;
#lag(1:5,n=2L)运行结果NA NA  1  2  3
Weekly %>%
  filter(lead(Lag1) != Today) %>%
  nrow()
#按年分类并找出每年第一周的周序号
Weekly$Week<-1:nrow(Weekly)
Year_breaks<-Weekly%>%group_by(Year)%>%summarise(Week=min(Week))
#按周绘制交易量随时间的变化折线图
ggplot(Weekly,aes(x=Week,y=Volume))+
  geom_line()+   #绘制折线图
  geom_smooth()+  #添加平滑趋势曲线
  theme_light() +  #设置主题
  scale_x_continuous(breaks = Year_breaks$Week,minor_breaks = NULL,labels = Year_breaks$Year)+
  #如何按自己的意愿设置X轴的标签
  labs(title = "Average Daily share trades vs Time", 
       x = "Time", 
       y = "volume")
#绘制堆积直方图
ggplot(Weekly,aes(x=Year,fill=Direction))+ 
  geom_bar(position = "fill")+
  geom_hline(yintercept = 0.5,col="black")+ #绘制y=0.5的水平参考线
  scale_x_continuous(breaks =seq(1990,2010),minor_breaks = NULL,labels = Year_breaks$Year )+
  scale_y_continuous(labels = scales::percent_format())+ #把y轴数值设为百分比制
  theme(axis.title.y =element_blank(),legend.position = "bottom")+  #取消y轴的标题
  ggtitle("% of Up/Down Weeks vs Time")
#分别计算出现Down和Up的概率
Week.probs<-prop.table(table(Weekly$Direction))
Week.probs
#绘制随时间变化的周波动
ggplot(Weekly, aes(x = Week, y = Today/100 )) +  #Today/100进行百分比化处理
  geom_line()+
  scale_x_continuous(breaks = Year_breaks$Week,minor_breaks = NULL,labels = Year_breaks$Year)+
  scale_y_continuous(labels = scales::percent_format(),breaks = seq(-0.2,0.2,0.05))+
  geom_hline(yintercept = 0,col="grey55")+ #绘制基准线
  theme_light()+
  labs(title = "Weekly Percentage Return vs Time",
       x="Time",
       y="Percentage Return")
#进行logistic回归拟合
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data = Weekly,family = binomial)
summary(glm.fit)
tidy(glm.fit) #将模型输出结果转化为数据框
#预测函数predict
#参数response告诉R只用输出概率P(Y=1|X)
#如果不给predict提供预测预测数据集,它会自动拟合logistic回归的训练数据的概率
glm.probs=predict(glm.fit,type = "response") 
glm.probs[1:5] #这些值对应市场是上涨而不是下跌的概率
#contrasts函数创建了一个哑变量 
contrasts(Weekly$Direction) 
#将用logistic回归的训练数据的预测结果转化为变化方向
glm.pred=rep("Down",1089)
glm.pred[glm.probs>.5]='Up'
#计算预测结果与原来结果的混淆矩阵,从而计算预测一致的概率
attach(Weekly)
table(glm.pred,Direction)
mean(glm.pred==Direction)
#用caret::confusionMatrix计算混淆矩阵
attach(Weekly)
Predicted<-factor(ifelse(predict(glm.fit,type = "response")<.5,"Down","Up"))
confusionMatrix(Predicted,Direction,positive = "Up")
prop.table(table(Predicted))
#用1990-2008年的训练数据来拟合logistic回归模型,只把lag2作为预测变量,计算2009-2010的预测准确率
attach(Weekly)
train=(Year<2009) #生成一个对应的布尔向量
#布尔向量可用于获取某个矩阵的行或子列
Weekly.2009=Weekly[!train,] #测试集数据
dim(Weekly.2009)  #查看该数据的维度
glm.fit1<-glm(Direction~Lag2,data = Weekly,family = binomial,subset = train)
# or glm.fit1<-glm(Direction~Lag2,data = Weekly[train,],family = binomial)
summary(glm.fit1)
glm.probs1<-predict(glm.fit1,Weekly.2009,type = "response")
glm.pred1<-rep("Down",104)
glm.pred1[glm.probs1>.5]="Up"
attach(Weekly.2009)
table(glm.pred1,Direction)
mean(glm.pred1==Direction)
#用confusionMatrix生成混淆矩阵
predicted1<-factor(ifelse(predict(glm.fit1,Weekly.2009,type = "response")<.5,"Down","Up"))
confusionMatrix(data = predicted1,reference=Weekly.2009$Direction,positive = "Up")

参考kaggle搬运加工练习小姐妹
原文参考:https://www.kaggle.com/lmorgan95/islr-classification-ch-4-solutions

相关标签: r语言