To read the stock prices (adjusted prices) data, we use the R package “quantmod”.

if (!require("quantmod")) {
  install.packages("quantmod")
  require("quantmod")
}
#If you do not specify the "to" option, then the end date of the data will be today.
getSymbols('CM',from="2000-01-01",to="2015-10-05")  
##     As of 0.4-0, 'getSymbols' uses env=parent.frame() and
##  auto.assign=TRUE by default.
## 
##  This  behavior  will be  phased out in 0.5-0  when the call  will
##  default to use auto.assign=FALSE. getOption("getSymbols.env") and 
##  getOptions("getSymbols.auto.assign") are now checked for alternate defaults
## 
##  This message is shown once per session and may be disabled by setting 
##  options("getSymbols.warning4.0"=FALSE). See ?getSymbols for more details.
## Warning in download.file(paste(yahoo.URL, "s=", Symbols.name, "&a=",
## from.m, : downloaded length 251563 != reported length 200
## [1] "CM"
Z <- CM
str(Z)
## An 'xts' object on 2000-01-03/2015-10-05 containing:
##   Data: num [1:3964, 1:6] 23.6 23 22.8 23.1 23.1 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:6] "CM.Open" "CM.High" "CM.Low" "CM.Close" ...
##   Indexed by objects of class: [Date] TZ: UTC
##   xts Attributes:  
## List of 2
##  $ src    : chr "yahoo"
##  $ updated: POSIXct[1:1], format: "2015-10-06 21:21:41"
dim(Z)  #the length of the data
## [1] 3964    6
head(Z) # show the first 6 rows of data
##            CM.Open CM.High CM.Low CM.Close CM.Volume CM.Adjusted
## 2000-01-03  23.625 23.6250 23.000  23.0000     21900    10.99155
## 2000-01-04  23.000 23.0625 22.625  23.0000     15000    10.99155
## 2000-01-05  22.750 23.4375 22.750  23.3750     11000    11.17076
## 2000-01-06  23.125 23.3125 23.000  23.1250      6200    11.05129
## 2000-01-07  23.125 23.1250 22.500  23.0000     13000    10.99155
## 2000-01-10  22.875 22.9375 22.625  22.6875      6700    10.84221
tail(Z) # show the last 6 rows of data
##            CM.Open CM.High CM.Low CM.Close CM.Volume CM.Adjusted
## 2015-09-28   69.42   69.91  69.03    69.03    313800       69.03
## 2015-09-29   69.23   69.96  68.87    69.86    226300       69.86
## 2015-09-30   70.49   72.07  70.47    71.96    300700       71.96
## 2015-10-01   72.66   72.86  72.16    72.60    334400       72.60
## 2015-10-02   71.99   73.02  71.33    73.01    458300       73.01
## 2015-10-05   73.71   74.99  73.52    74.88    340600       74.88

Separate the

train <- Z[1:2000]
test <- Z[2001:3956]

You have several ways to visualize the data.

#From the xts package
plot(train[,6], main = "Adjusted price, CIBC stock")

#From the quantmod package
chartSeries(train[,6], theme = "white", name = "Adjusted price, CIBC stock")

#From the lattice package
library(lattice)
xyplot(train[,6], main = "Adjusted price, CIBC stock")

Next, we compute the log return for the stock price.

R <- diff( log(train[,6]) )
head(R)
##             CM.Adjusted
## 2000-01-03           NA
## 2000-01-04  0.000000000
## 2000-01-05  0.016172850
## 2000-01-06 -0.010752755
## 2000-01-07 -0.005420095
## 2000-01-10 -0.013680034
R <- R[-1,]
plot(R, main = "Log returns, CIBC stock")

Let Y_t=|R_t| and consider the scatterplot of Y_t vsY_t-1

Ra <- abs(R)
Ra <- as.numeric(Ra)
plot( Ra[-length(Ra)], Ra[-1], 
      xlab = expression(Y[t-1]), ylab = expression(Y[t]),
      main = paste(expression(Y[t]),"vs",expression(Y[t-1])),
      pch=".", cex=4 )

If we change the figure scale to be log-log

summary(Ra)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.003729 0.007988 0.010840 0.014370 0.083300
#Two outliers
plot( Ra[-length(Ra)], Ra[-1], log = "xy",
      xlab = expression(Y[t-1]), ylab = expression(Y[t]),
      main = paste(expression(Y[t]),"vs",expression(Y[t-1]),"loglog"),
      pch=".", cex=4 )
## Warning in xy.coords(x, y, xlabel, ylabel, log): 19 x values <= 0 omitted
## from logarithmic plot
## Warning in xy.coords(x, y, xlabel, ylabel, log): 18 y values <= 0 omitted
## from logarithmic plot

#So we perform a shifted log transformation
Raslog <- log(Ra+0.001)

xslog <- Raslog[-length(Raslog)]
yslog <- Raslog[-1]

plot( xslog, yslog, 
      xlab = expression(Y[t-1]), ylab = expression(Y[t]),
      main = paste("shifted log",expression(Y[t]),"vs","shifted log",expression(Y[t-1])),
      pch=".", cex=4 )
points(xslog[xslog==log(0.001)],yslog[xslog==log(0.001)],col="red")
points(xslog[yslog==log(0.001)],yslog[yslog==log(0.001)],col="red")

Fit a linear model

#Let's fit a least sum of residual squares regression
m1 <- lm(yslog~xslog)
summary(m1)
## 
## Call:
## lm(formula = yslog ~ xslog)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.39111 -0.57949  0.06551  0.60846  2.40513 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.10380    0.10765 -38.120  < 2e-16 ***
## xslog        0.14188    0.02214   6.408 1.83e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8574 on 1996 degrees of freedom
## Multiple R-squared:  0.02016,    Adjusted R-squared:  0.01967 
## F-statistic: 41.06 on 1 and 1996 DF,  p-value: 1.832e-10
#Thought the slope term is significant, its value is quite low which means the linear correlation between consecutive returns is small.

#We can also check the assumption of normal residuals
qqnorm(resid(m1))
qqline(resid(m1))

#Let's try a robust regression, the least median of squares regression
library(MASS)
m2 <- lmsreg(yslog~xslog)
#Thought it is robust method, the LME may give different estimates as its solution is not unique.

#Visualize the fitted model
plot( xslog, yslog, 
      xlab = expression(Y[t-1]), ylab = expression(Y[t]),
      main = paste("shifted log",expression(Y[t]),"vs","shifted log",expression(Y[t-1])),
      pch=".", cex=4 )
abline(m1) #Black line: LSE Line
abline(m2,col="red") #Red line: least median of squares regression

legend("bottomright",legend = c("LSE","LME"),col=c("black","red"),lty=c(1,1))

For the test subset

Rtest <- diff( log(test[,6]) )
Rtest <- Rtest[-1,]
Ratest <- abs(Rtest)
Ratest <- as.numeric(Ratest)

xtest <- Ratest[-length(Ratest)]
ytest <- Ratest[-1]

#Three criteria
# predict(m1) will give the fitted values for the training set


#For the test data set
#Mean squared prediction error (MSPE)
sqrt(mean((exp( predict( m1,newdata = data.frame(xslog=log(xtest+0.001)) ) )-0.001-ytest)^2))  
## [1] 0.01594851
#Mean absolute errors (MAE)
mean(abs(exp( predict( m1,newdata = data.frame(xslog=log(xtest+0.001)) ) )-0.001-ytest)) 
## [1] 0.008827386
#Mean absolute percentage error (MAPE)
mean( abs( (exp( predict( m1,newdata = data.frame(xslog=log(xtest+0.001)) ) )-0.001-ytest)/ytest)  ) 
## [1] Inf
#there are 0 values in ytest, so instead of looking at mean, we look at median
summary(ytest)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.003544 0.007987 0.012770 0.015240 0.156100
median( abs( (exp( predict( m1,newdata = data.frame(xslog=log(xtest+0.001)) ) )-0.001-ytest)/ytest) )
## [1] 0.598594

This is just a sample code for Assignment 1. Feel free to modify the code and make appropriate comments by yourself.