Chapter 3 Empirical Properties of Financial Data
3.1 Exploring financial time series
Loading, exploring, merging, aggregating and presenting financial time series.
library(xts)
library(qrmdata)
library(qrmtools)
We denote a return series by \(X_1,..., X_n\) and assume that the returns have been formed by logarithmic differencing of a price, index or exchange-rate series \((S_t)_{t=0,1,...,n}\), so \(X_t = \ln(S_t/S_{t-1}), \ t = 1,...,n\).
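For instance, with a short hypothetical price series (purely illustrative, not data used below), log-returns can be computed directly via diff(log()):
S = c(100, 102, 101, 105) # hypothetical prices S_0,...,S_3
(X = diff(log(S))) # log-returns X_t = log(S_t/S_{t-1}); first value is log(102/100) ~ 0.0198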
3.1.1 Stock prices
Dow Jones stock price data:
data("DJ_const")
str(DJ_const)
## An 'xts' object on 1962-01-02/2015-12-31 containing:
## Data: num [1:13595, 1:30] NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:30] "AAPL" "AXP" "BA" "CAT" ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## List of 2
## $ src : chr "yahoo"
## $ updated: POSIXct[1:1], format: "2016-01-03 03:51:40"
We extract a time period and take the first 10 stocks:
DJdata = DJ_const['2006-12-29/2015-12-31', 1:10]
Use plot for zoo objects to get multiple plots:
plot.zoo(DJdata, xlab = "Time", main = "DJ (10 component series)")
X = returns(DJdata) # or diff(log(DJdata))[-1,]
head(X)
## AAPL AXP BA CAT
## 2007-01-03 -0.012334120897 -0.0026472451560 0.003707673683 -0.002775765300
## 2007-01-04 0.021952926933 -0.0073163443921 0.004029116838 -0.002619510253
## 2007-01-05 -0.007146655399 -0.0132718850725 -0.004253386164 -0.012869324130
## 2007-01-08 0.004926129847 0.0094260573927 -0.002358358218 0.001161734403
## 2007-01-09 0.079799667183 -0.0063865234723 -0.010625198954 0.005458576326
## 2007-01-10 0.046745828277 -0.0006746670108 0.014328641844 -0.004961130145
## CSCO CVX DD DIS
## 2007-01-03 0.0145298305107 -0.035436194360 0.0067520100735 -0.002044583519
## 2007-01-04 0.0259847195971 -0.009770024128 -0.0069572791680 0.007863637421
## 2007-01-05 0.0003513138828 0.003834480727 -0.0134369575441 -0.008156047838
## 2007-01-08 0.0056042225031 0.012676130506 0.0004161386786 0.009026136834
## 2007-01-09 -0.0056042225031 -0.011542923051 0.0064282683029 -0.001450278326
## 2007-01-10 0.0073491451876 -0.017423932059 -0.0024834283353 0.001160410990
## GE GS
## 2007-01-03 0.0202188821092 0.006848800109
## 2007-01-04 -0.0058109041379 -0.009360107362
## 2007-01-05 -0.0050457874538 0.001005264216
## 2007-01-08 -0.0002663387811 0.023239502819
## 2007-01-09 0.0000000000000 0.001716517333
## 2007-01-10 0.0002663387811 0.019554703696
plot.zoo(X, xlab = "Time", main = "Log-returns of 10 DJ component series")
Volatility clustering is the tendency for extreme returns to be followed by other extreme returns, although not necessarily with the same sign.
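One way to visualize volatility clustering (a small sketch, assuming the daily log-returns X computed above) is to compare the autocorrelations of the raw and the absolute returns of one component; clustering shows up as slowly decaying autocorrelation of the absolute returns:
## Sketch: volatility clustering for one component (assumes X from above)
acf(as.numeric(X[, "AAPL"])) # raw log-returns: little serial correlation
acf(abs(as.numeric(X[, "AAPL"]))) # absolute log-returns: pronounced serial correlation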
Aggregating log-returns by summation for each column (log-returns are additive over time, since \(\sum_{t=1}^{T}\ln(S_t/S_{t-1}) = \ln(S_T/S_0)\)):
## Weekly
X.w = apply.weekly(X, FUN = colSums)
dim(X.w)
## [1] 470 10
plot.zoo(X.w, type = "h", xlab = "Time", main = "Weekly log-returns") # 'h' = histogram = vertical lines only
## Monthly
X.m = apply.monthly(X, FUN = colSums)
dim(X.m)
## [1] 108 10
plot.zoo(X.m, type = "h", xlab = "Time", main = "Monthly log-returns")
## Quarterly
X.q = apply.quarterly(X, FUN = colSums)
dim(X.q)
## [1] 36 10
plot.zoo(X.q, type = "h", xlab = "Time", main = "Quarterly log-returns")
3.1.2 Stock indexes
Load stock index data:
data("SP500") # S&P 500
data("FTSE") # FTSE
data("SMI") # SMI
plot.zoo(SP500, xlab = "Time", ylab = "S&P 500")
Merge the three time series:
all = merge(SP500, FTSE, SMI)
nms = c("S&P 500", "FTSE", "SMI")
colnames(all) = nms
plot.zoo(all, xlab = "Time", main = "All")
Merge and retain only days where all three time series are available:
all.avail = merge(SP500, FTSE, SMI, all = FALSE)
colnames(all.avail) = nms
plot.zoo(all.avail, xlab = "Time", main = "All available")
Compute returns:
SP500.X = returns(SP500)
FTSE.X  = returns(FTSE)
SMI.X   = returns(SMI)
X = merge(SP500.X, FTSE.X, SMI.X, all = FALSE)
colnames(X) = nms
plot.zoo(X, xlab = "Time", main = "Log-returns")
pairs(as.zoo(X), gap = 0, cex = 0.4)
Aggregating:
## By week
X.w = apply.weekly(X, FUN = colSums)
plot.zoo(X.w, xlab = "Time", main = "Weekly log-returns", type = "h")
## By month
X.m = apply.monthly(X, FUN = colSums)
plot.zoo(X.m, xlab = "Time", main = "Monthly log-returns", type = "h")
3.1.3 Exchange rates
Load exchange rate data:
data("GBP_USD") # 1 GBP in USD
data("EUR_USD") # 1 EUR in USD
data("JPY_USD") # 1 JPY in USD
data("CHF_USD") # 1 CHF in USD
FX = merge(GBP_USD, EUR_USD, JPY_USD, CHF_USD)
head(FX)
## GBP.USD EUR.USD JPY.USD CHF.USD
## 2000-01-01 1.6170 1.0057 0.009785693316 0.6285
## 2000-01-02 1.6170 1.0055 0.009789525208 0.6272
## 2000-01-03 1.6370 1.0258 0.009838646202 0.6392
## 2000-01-04 1.6357 1.0309 0.009682416731 0.6427
## 2000-01-05 1.6418 1.0322 0.009588647042 0.6432
## 2000-01-06 1.6466 1.0318 0.009502090460 0.6431
plot.zoo(FX, xlab = "Time", main = "Exchange rates to USD")
X = returns(FX)
plot.zoo(X, xlab = "Time", main = "Log-returns of exchange rates to USD",
         col = c("black", "royalblue3", "maroon3", "darkorange2"))
3.1.4 Zero-coupon bond yields
Load zero-coupon bond yield data (in USD):
Note: Typically, \[ \text{yield} = \left(\left(\frac{\text{face value}}{\text{current bond price}}\right)^{\frac{1}{\text{maturity}}} - 1\right) \cdot 100\%, \] since \(\text{face value} = \text{current price} \cdot (1 + \text{yield})^\text{maturity}\).
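For example (a hypothetical bond, purely illustrative), a 10-year zero-coupon bond with face value 100 trading at 60 has a yield of about 5.24%:
## Hypothetical example: 10-year zero-coupon bond with face value 100, price 60
face = 100; price = 60; maturity = 10
(yield = ((face/price)^(1/maturity) - 1) * 100) # yield in %, ~5.24
price * (1 + yield/100)^maturity # check: recovers the face value (~100)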
data("ZCB_USD")
dim(ZCB_USD) # => 30-dimensional; each dimension is a maturity
## [1] 7509 30
head(ZCB_USD)
## 1y 2y 3y 4y 5y 6y 7y 8y 9y
## 1985-11-25 7.8551 8.3626 8.7469 9.0408 9.2686 9.4481 9.5924 9.7110 9.8108
## 1985-11-26 7.8527 8.3575 8.7420 9.0376 9.2677 9.4494 9.5956 9.7154 9.8158
## 1985-11-27 7.8425 8.3522 8.7339 9.0231 9.2453 9.4193 9.5586 9.6727 9.7688
## 1985-11-29 7.7914 8.3148 8.6991 8.9852 9.2024 9.3712 9.5061 9.6172 9.7115
## 1985-12-02 7.8176 8.3457 8.7359 9.0294 9.2550 9.4327 9.5766 9.6962 9.7983
## 1985-12-03 7.8527 8.3679 8.7540 9.0476 9.2746 9.4535 9.5974 9.7156 9.8148
## 10y 11y 12y 13y 14y 15y 16y 17y
## 1985-11-25 9.8969 9.9727 10.0408 10.1029 10.1605 10.2143 10.2649 10.3128
## 1985-11-26 9.9015 9.9762 10.0425 10.1023 10.1569 10.2073 10.2542 10.2981
## 1985-11-27 9.8515 9.9246 9.9903 10.0503 10.1060 10.1579 10.2069 10.2531
## 1985-11-29 9.7938 9.8673 9.9343 9.9963 10.0541 10.1086 10.1600 10.2087
## 1985-12-02 9.8874 9.9667 10.0383 10.1038 10.1641 10.2200 10.2720 10.3206
## 1985-12-03 9.8996 9.9733 10.0383 10.0964 10.1488 10.1965 10.2401 10.2802
## 18y 19y 20y 21y 22y 23y 24y 25y
## 1985-11-25 10.3583 10.4016 10.4428 10.4821 10.5196 10.5553 10.5893 10.6217
## 1985-11-26 10.3393 10.3782 10.4150 10.4497 10.4827 10.5139 10.5435 10.5715
## 1985-11-27 10.2969 10.3385 10.3781 10.4157 10.4514 10.4853 10.5176 10.5482
## 1985-11-29 10.2549 10.2987 10.3403 10.3798 10.4172 10.4526 10.4861 10.5179
## 1985-12-02 10.3660 10.4085 10.4483 10.4856 10.5206 10.5534 10.5842 10.6131
## 1985-12-03 10.3172 10.3515 10.3832 10.4127 10.4402 10.4658 10.4897 10.5121
## 26y 27y 28y 29y 30y
## 1985-11-25 10.6526 10.6819 10.7099 10.7365 10.7618
## 1985-11-26 10.5982 10.6234 10.6474 10.6701 10.6917
## 1985-11-27 10.5773 10.6049 10.6311 10.6560 10.6796
## 1985-11-29 10.5479 10.5763 10.6032 10.6287 10.6528
## 1985-12-02 10.6402 10.6657 10.6897 10.7123 10.7335
## 1985-12-03 10.5330 10.5526 10.5710 10.5882 10.6044
ZCB = ZCB_USD['2002-01-02/2011-12-30']
plot.zoo(ZCB, xlab = "Time", main = "Percentage yields")
Compute differences (first row is removed):
X = returns(ZCB, method = "diff")
Pick 3 maturities:
X3 = X[, c(1, 5, 30)]
plot.zoo(X3, xlab = "Time", main = "Differences (3 maturities)")
pairs(as.zoo(X3), gap = 0, cex = 0.4)
Plot the corresponding “pseudo-observations” (componentwise scaled ranks):
U3 = apply(X3, 2, rank) / (nrow(X3) + 1) # scaled ranks; divide by (number of observations + 1)
pairs(as.zoo(U3), gap = 0, cex = 0.4)
## Note: More on that ("pseudo-observations") in Chapter 7
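Equivalently (assuming the copula package is installed; this check is not part of the original), the same pseudo-observations can be obtained with copula::pobs():
## Optional check via the copula package (if installed)
if (requireNamespace("copula", quietly = TRUE))
    stopifnot(all.equal(copula::pobs(as.matrix(X3)), U3, check.attributes = FALSE))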
3.2 Ljung-Box test of serial independence
Ljung-Box tests of independence.
library(xts)
library(qrmdata)
library(qrmtools)
3.2.1 Constituent data
Dow Jones constituent data:
data("DJ_const")
We extract a time period and remove the time series ‘Visa’ (as it only has a very short history in the index):
DJdata = DJ_const['2006-12-29/2015-12-31', -which(names(DJ_const) == "V")]
Use plot for zoo objects to get multiple plots:
plot.zoo(DJdata, xlab = "Time", main = "DJ component series (without Visa)")
Build log-returns and aggregate to obtain monthly log-returns:
X = returns(DJdata) # could also work with negative log-returns
X.m = apply.monthly(X, FUN = colSums)
3.2.2 Ljung-Box tests of serial independence of stationary data
Compute (lists of) Ljung-Box tests:
\(H_0\): The data are independently distributed.
\(H_1\): The data are not independently distributed; they exhibit serial correlation.
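The Ljung-Box statistic compares the first \(h\) sample autocorrelations with zero, \(Q = n(n+2)\sum_{k=1}^{h} \hat{\rho}_k^2/(n-k)\), and is referred to a \(\chi^2_h\) distribution. A small sketch of this computation for one series (assuming X from above; Box.test() with type = "Ljung-Box" computes the same statistic):
## Sketch: Ljung-Box statistic "by hand" for one component (assumes X from above)
x   = as.numeric(X[, "AAPL"])
n   = length(x)
h   = 10 # number of lags
rho = acf(x, lag.max = h, plot = FALSE)$acf[-1] # sample autocorrelations at lags 1,...,h
(Q  = n * (n + 2) * sum(rho^2 / (n - seq_len(h)))) # Ljung-Box statistic
pchisq(Q, df = h, lower.tail = FALSE) # p-value; compare with Box.test(x, lag = h, type = "Ljung-Box")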
LB.raw   = apply(X, 2, Box.test, lag = 10, type = "Ljung-Box")
LB.abs   = apply(abs(X), 2, Box.test, lag = 10, type = "Ljung-Box") # could also work with squared log-returns
LB.raw.m = apply(X.m, 2, Box.test, lag = 10, type = "Ljung-Box")
LB.abs.m = apply(abs(X.m), 2, Box.test, lag = 10, type = "Ljung-Box")
Extract p-values:
sapply() is like lapply(), but returns a vector or matrix instead of a list. Here `[[` is an extraction function which returns an element, e.g. LB.raw[["AAPL"]][["p.value"]].
p.LB.raw   = sapply(LB.raw, `[[`, "p.value")
p.LB.abs   = sapply(LB.abs, `[[`, "p.value")
p.LB.raw.m = sapply(LB.raw.m, `[[`, "p.value")
p.LB.abs.m = sapply(LB.abs.m, `[[`, "p.value")
round(cbind(p.LB.raw, p.LB.abs, p.LB.raw.m, p.LB.abs.m), 2)
## p.LB.raw p.LB.abs p.LB.raw.m p.LB.abs.m
## AAPL 0.14 0 0.54 0.00
## AXP 0.00 0 0.01 0.00
## BA 0.03 0 0.13 0.00
## CAT 0.08 0 0.01 0.00
## CSCO 0.11 0 0.32 0.14
## CVX 0.00 0 0.60 0.30
## DD 0.00 0 0.09 0.02
## DIS 0.00 0 0.00 0.04
## GE 0.00 0 0.17 0.00
## GS 0.00 0 0.61 0.00
## HD 0.25 0 0.12 0.42
## IBM 0.00 0 0.67 0.64
## INTC 0.00 0 0.65 0.90
## JNJ 0.00 0 0.25 0.30
## JPM 0.00 0 0.62 0.00
## KO 0.00 0 0.01 0.22
## MCD 0.00 0 0.10 0.60
## MMM 0.00 0 0.36 0.51
## MRK 0.03 0 0.45 0.01
## MSFT 0.00 0 0.80 0.90
## NKE 0.00 0 0.79 0.73
## PFE 0.00 0 0.32 0.05
## PG 0.00 0 0.78 0.04
## TRV 0.00 0 0.30 0.81
## UNH 0.00 0 0.03 0.00
## UTX 0.00 0 0.02 0.08
## VZ 0.00 0 0.10 0.43
## WMT 0.00 0 0.83 0.86
## XOM 0.00 0 0.60 0.40
3.2.3 Reproduce Ljung-Box tests from the QRM book
Up to minor differences (see below), this reproduces Table 3.1 in the MFE (2015):
## Note: This uses older DJ data from 'QRM' package
DJ.QRM  = QRM::DJ
DJ.old  = as.xts(DJ.QRM)
DJ.old  = DJ.old['1993-01-01/2000-12-31']
X.old   = returns(DJ.old)
X.old.m = apply.monthly(X.old, FUN = colSums)
Compute (lists of) Ljung-Box tests:
LB.raw   = apply(X.old, 2, Box.test, lag = 10, type = "Ljung-Box")
LB.abs   = apply(abs(X.old), 2, Box.test, lag = 10, type = "Ljung-Box")
LB.raw.m = apply(X.old.m, 2, Box.test, lag = 10, type = "Ljung-Box")
LB.abs.m = apply(abs(X.old.m), 2, Box.test, lag = 10, type = "Ljung-Box")
Extract p-values:
p.LB.raw   = sapply(LB.raw, `[[`, "p.value")
p.LB.abs   = sapply(LB.abs, `[[`, "p.value")
p.LB.raw.m = sapply(LB.raw.m, `[[`, "p.value")
p.LB.abs.m = sapply(LB.abs.m, `[[`, "p.value")
(res = round(cbind(p.LB.raw, p.LB.abs, p.LB.raw.m, p.LB.abs.m), 2))
## p.LB.raw p.LB.abs p.LB.raw.m p.LB.abs.m
## AA 0.00 0 0.23 0.02
## AXP 0.02 0 0.54 0.07
## T 0.11 0 0.70 0.02
## BA 0.03 0 0.90 0.15
## CAT 0.28 0 0.73 0.07
## C 0.09 0 0.91 0.49
## KO 0.00 0 0.50 0.03
## DD 0.03 0 0.74 0.00
## EK 0.15 0 0.59 0.54
## XOM 0.00 0 0.33 0.22
## GE 0.00 0 0.25 0.09
## GM 0.64 0 0.82 0.26
## HWP 0.10 0 0.21 0.02
## HD 0.00 0 0.00 0.41
## HON 0.44 0 0.07 0.30
## INTC 0.23 0 0.80 0.62
## IBM 0.18 0 0.67 0.27
## IP 0.15 0 0.01 0.09
## JPM 0.52 0 0.43 0.12
## JNJ 0.00 0 0.10 0.89
## MCD 0.28 0 0.72 0.67
## MRK 0.05 0 0.54 0.65
## MSFT 0.28 0 0.19 0.13
## MMM 0.00 0 0.57 0.33
## MO 0.01 0 0.69 0.82
## PG 0.02 0 0.99 0.73
## SBC 0.05 0 0.12 0.00
## UTX 0.00 0 0.12 0.01
## WMT 0.00 0 0.42 0.63
## DIS 0.44 0 0.01 0.52
Note: The minor differences to the tables in the book come from a different approach. The tables in the book were produced without ‘xts’ objects:
library(timeSeries) # Caution: returns() now from timeSeries
## Loading required package: timeDate
##
## Attaching package: 'timeSeries'
## The following object is masked from 'package:zoo':
##
## time<-
## The following object is masked from 'package:qrmtools':
##
## returns
X.old.   = returns(DJ.QRM)
X.old.   = window(X.old., start = timeDate("1993-01-01"), end = timeDate("2000-12-31"))
X.old.m. = aggregate(X.old., by = unique(timeLastDayInMonth(time(X.old.))), sum)
detach("package:timeSeries")
Compute (lists of) Ljung-Box tests:
LB.raw.   = apply(X.old., 2, Box.test, lag = 10, type = "Ljung-Box")
LB.abs.   = apply(abs(X.old.), 2, Box.test, lag = 10, type = "Ljung-Box")
LB.raw.m. = apply(X.old.m., 2, Box.test, lag = 10, type = "Ljung-Box")
LB.abs.m. = apply(abs(X.old.m.), 2, Box.test, lag = 10, type = "Ljung-Box")
Extract p-values:
p.LB.raw.   = sapply(LB.raw., `[[`, "p.value")
p.LB.abs.   = sapply(LB.abs., `[[`, "p.value")
p.LB.raw.m. = sapply(LB.raw.m., `[[`, "p.value")
p.LB.abs.m. = sapply(LB.abs.m., `[[`, "p.value")
Result:
(res. = round(cbind(p.LB.raw., p.LB.abs., p.LB.raw.m., p.LB.abs.m.), 2))
## p.LB.raw. p.LB.abs. p.LB.raw.m. p.LB.abs.m.
## AA 0.00 0 0.23 0.02
## AXP 0.02 0 0.55 0.07
## T 0.11 0 0.70 0.02
## BA 0.03 0 0.90 0.17
## CAT 0.28 0 0.73 0.07
## C 0.09 0 0.91 0.48
## KO 0.00 0 0.50 0.03
## DD 0.03 0 0.75 0.00
## EK 0.15 0 0.61 0.54
## XOM 0.00 0 0.32 0.22
## GE 0.00 0 0.25 0.09
## GM 0.65 0 0.81 0.27
## HWP 0.09 0 0.21 0.02
## HD 0.00 0 0.00 0.41
## HON 0.44 0 0.07 0.30
## INTC 0.23 0 0.79 0.62
## IBM 0.18 0 0.67 0.28
## IP 0.15 0 0.01 0.09
## JPM 0.52 0 0.43 0.12
## JNJ 0.00 0 0.11 0.91
## MCD 0.28 0 0.72 0.68
## MRK 0.05 0 0.53 0.65
## MSFT 0.28 0 0.19 0.13
## MMM 0.00 0 0.57 0.33
## MO 0.01 0 0.68 0.82
## PG 0.02 0 0.99 0.74
## SBC 0.05 0 0.13 0.00
## UTX 0.00 0 0.12 0.01
## WMT 0.00 0 0.41 0.64
## DIS 0.44 0 0.01 0.51
Differences:
summary(res-res.)
## p.LB.raw p.LB.abs p.LB.raw.m p.LB.abs.m
## Min. :-0.01 Min. :0 Min. :-0.02 Min. :-0.020000000
## 1st Qu.: 0.00 1st Qu.:0 1st Qu.: 0.00 1st Qu.: 0.000000000
## Median : 0.00 Median :0 Median : 0.00 Median : 0.000000000
## Mean : 0.00 Mean :0 Mean : 0.00 Mean :-0.002333333
## 3rd Qu.: 0.00 3rd Qu.:0 3rd Qu.: 0.00 3rd Qu.: 0.000000000
## Max. : 0.01 Max. :0 Max. : 0.01 Max. : 0.010000000
3.3 Univariate stylized facts
Illustrates univariate stylized facts for financial return series.
Univariate stylized facts:
(U1) Return series are not iid although they show little serial correlation;
(U2) Series of absolute (or squared) returns show profound serial correlation;
(U3) Conditional expected returns are close to zero;
(U4) Volatility (conditional standard deviation) appears to vary over time;
(U5) Extreme returns appear in clusters;
(U6) Return series are leptokurtic or heavy-tailed (power-like tail).
library(xts)
library(qrmdata)
library(qrmtools)
3.3.1 Index data
data("SP500")
data("FTSE")
data("SMI")
Build log-returns:
SP500.X = returns(SP500)
FTSE.X  = returns(FTSE)
SMI.X   = returns(SMI)
Remove zero returns (caused almost certainly by market closure):
SP500.X = SP500.X[SP500.X != 0]
SMI.X   = SMI.X[SMI.X != 0]
FTSE.X  = FTSE.X[FTSE.X != 0]
Merge and aggregate:
X   = merge(SP500 = SP500.X, FTSE = FTSE.X, SMI = SMI.X, all = FALSE)
X.w = apply.weekly(X, FUN = colSums) # weekly log-returns (summing within weeks)
3.3.2 (U1) and (U2)
(U1) Return series are not iid although they show little serial correlation
(U2) Series of absolute (or squared) returns show profound serial correlation
Auto- and crosscorrelations of returns and absolute returns:
acf(X)
acf(abs(X))
acf(X.w)
acf(abs(X.w))
3.3.3 (U3) and (U4)
(U3) Conditional expected returns are close to zero;
(U4) Volatility (conditional standard deviation) appears to vary over time;
Plot of returns:
plot.zoo(X, xlab = "Time", main = "Log-returns") # this (partly) confirms (U3)
Plot of volatilities:
X.vols = apply.monthly(X, FUN = function(x) apply(x, 2, sd)) # componentwise volatilities
plot.zoo(X.vols, xlab = "Time", main = "Volatility estimates")
3.3.4 (U5)
- (U5) Extreme returns appear in clusters
Plot the largest losses (already partly confirms (U5)):
L = -SP500.X # consider -log-returns of S&P 500 here
r = 0.01 # probability to determine large losses
u = quantile(L, probs = 1 - r) # determine empirical (1-r)-quantile
xtr.L = L[L > u] # exceedances over the chosen empirical quantile = largest so-many losses
plot(as.numeric(xtr.L), type = "h", xlab = "Time",
     ylab = substitute("Largest"~r.*"% of losses ("*n.~"losses)",
                       list(r. = 100 * r, n. = length(xtr.L))))
Now consider the spacings between the exceedance times. For iid data, exceedances of a high threshold occur approximately according to a Poisson process, so the spacings should be roughly exponentially distributed; here they are certainly not:
spcs = as.numeric(diff(time(xtr.L)))
qq_plot(spcs, FUN = function(p) qexp(p, rate = r)) # r = exceedance probability
Compare with a Poisson process (simulated iid data):
set.seed(271)
L. = rt(length(L), df = 3) # simulate iid data from a t_3 distribution
u. = quantile(L., probs = 1 - r) # determine empirical (1-r)-quantile
xtr.L. = L.[L. > u.] # exceedances
plot(xtr.L., type = "h", xlab = "Time",
     ylab = substitute("Largest"~r.*"% of losses ("*n.~"losses)",
                       list(r. = 100 * r, n. = length(xtr.L.))))
Spacings:
spcs. = diff(which(L. > u.))
qq_plot(spcs., FUN = function(p) qexp(p, rate = r))
3.3.5 (U6)
- (U6) Return series are leptokurtic or heavy-tailed (power-like tail)
Show leptokurtosis (heavier tails than a normal) by Q-Q plots against a normal:
layout(t(1:3)) # (1,3)-layout
for (i in 1:3)
    qq_plot(X.w[,i], FUN = qnorm, method = "empirical",
            # method = "empirical" accounts for the unknown location (mu) and scale (sig) of qnorm()
            main = names(X.w)[i]) # equivalent to qqnorm(); qqline()
layout(1) # restore layout
## One can also look at various formal tests
A Q–Q plot (quantile–quantile plot) is a standard visual tool for showing the relationship between empirical quantiles of the data and theoretical quantiles of a reference distribution. A lack of linearity in the Q–Q plot is interpreted as evidence against the hypothesized reference distribution.
The inverted S-shape of the points indicates that the extreme empirical quantiles are larger in absolute value than the corresponding quantiles of a normal distribution, i.e. the data have heavier tails than the normal; the normal distribution is therefore a poor model for these returns.
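To recall what such a plot is built from (a sketch "by hand"; qq_plot() from qrmtools may use slightly different plotting positions), one can plot the sorted data against the normal quantiles at the plotting positions and add a line through the quartiles:
## Sketch: normal Q-Q plot "by hand" for one weekly return series (assumes X.w from above)
x = as.numeric(X.w[, 1])
plot(qnorm(ppoints(length(x))), sort(x),
     xlab = "Theoretical N(0,1) quantiles", ylab = "Empirical quantiles")
qqline(x) # line through the first and third sample/theoretical quartiles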
3.4 Testing univariate normality
Univariate tests of normality, data application and uniformity of p-values.
library(nortest) # for cvm.test()
library(ADGofTest) # for ad.test()
##
## Attaching package: 'ADGofTest'
## The following object is masked from 'package:nortest':
##
## ad.test
library(moments) # for agostino.test(), jarque.test()
##
## Attaching package: 'moments'
## The following objects are masked from 'package:timeDate':
##
## kurtosis, skewness
library(xts) # for time-series related functions
library(qrmdata)
library(qrmtools)
3.4.1 Generate data
n   = 1000 # sample size
mu  = 1 # location
sig = 2 # scale
nu  = 3 # degrees of freedom
set.seed(271) # set seed (for reproducibility)
X.N = rnorm(n, mean = mu, sd = sig) # sample from N(mu, sig^2)
X.t = mu + sig * rt(n, df = nu) * sqrt((nu-2)/nu) # sample from t_nu(mu, sqrt((nu-2)/nu)*sig^2) (also variance sig^2)
if(FALSE) {
var(X.N)
var(X.t)
## => quite apart, but closer for larger n
}
3.4.2 Testing for \(\mathcal{N}(\mu, \sigma^2)\)
We treat \(\mu\) and \(\sigma^2\) as unknown and estimate them:
mu.N  = mean(X.N)
mu.t  = mean(X.t)
sig.N = sd(X.N)
sig.t = sd(X.t)
Tests for general distributions (here applied to the normal)
The Kolmogorov–Smirnov test:
is a nonparametric test of the equality of continuous, one-dimensional probability distributions; it can be used to compare a sample with a reference probability distribution (one-sample K-S test) or to compare two samples (two-sample K-S test). Here it serves as a goodness-of-fit test for normality with specified parameters.
\(H_0\): the distribution function which generated \(X\) is a normal distribution with the specified parameters.
The Cramer-von Mises test
is an EDF omnibus test for the composite hypothesis of normality.
\(H_0\): the distribution function which generated \(X\) is normal.
The Anderson-Darling goodness of fit test:
checks whether \(X\) is an iid sample from the distribution with a given distribution function (CDF).
\(H_0\): the distribution function which generated \(X\) is a normal distribution with the specified parameters.
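The K-S statistic itself is simply the largest distance between the empirical distribution function and the reference CDF, \(D_n = \sup_x |F_n(x) - F(x)|\). A small sketch of this computation (assuming X.N, mu.N and sig.N from above; ks.test() should report the same value):
## Sketch: Kolmogorov-Smirnov statistic "by hand" (assumes X.N, mu.N, sig.N from above)
x  = sort(X.N)
n. = length(x)
Fx = pnorm(x, mean = mu.N, sd = sig.N) # reference CDF evaluated at the sorted data
(D = max(seq_len(n.)/n. - Fx, Fx - (seq_len(n.) - 1)/n.)) # sup-distance |F_n - F|
## compare with ks.test(X.N, y = "pnorm", mean = mu.N, sd = sig.N)$statistic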
Applied to normal data (with estimated \(\mu\) and \(\sigma^2\)):
(ks = ks.test(X.N, y = "pnorm", mean = mu.N, sd = sig.N)) # Kolmogorov--Smirnov
##
## One-sample Kolmogorov-Smirnov test
##
## data: X.N
## D = 0.023172117, p-value = 0.6562203
## alternative hypothesis: two-sided
(cvm = cvm.test(X.N)) # Cramer--von Mises for normality
##
## Cramer-von Mises normality test
##
## data: X.N
## W = 0.053604764, p-value = 0.4591003
(ad = ad.test(X.N, distr.fun = pnorm, mean = mu.N, sd = sig.N)) # Anderson--Darling
##
## Anderson-Darling GoF Test
##
## data: X.N and pnorm
## AD = 0.34126087, p-value = 0.9042402
## alternative hypothesis: NA
stopifnot(min(ks$p.value, cvm$p.value, ad$p.value) >= 0.05) # => no rejections
Applied to \(t\) data (with estimated \(\mu\) and \(\sigma^2\)):
(ks = ks.test(X.t, y = "pnorm", mean = mu.t, sd = sig.t)) # Kolmogorov--Smirnov
##
## One-sample Kolmogorov-Smirnov test
##
## data: X.t
## D = 0.14299637, p-value < 2.2204e-16
## alternative hypothesis: two-sided
(cvm = cvm.test(X.t)) # Cramer--von Mises for normality
## Warning in cvm.test(X.t): p-value is smaller than 7.37e-10, cannot be computed
## more accurately
##
## Cramer-von Mises normality test
##
## data: X.t
## W = 8.0937659, p-value = 7.37e-10
(ad = ad.test(X.t, distr.fun = pnorm, mean = mu.t, sd = sig.t)) # Anderson--Darling
##
## Anderson-Darling GoF Test
##
## data: X.t and pnorm
## AD = Inf, p-value = 6e-07
## alternative hypothesis: NA
stopifnot(max(ks$p.value, cvm$p.value, ad$p.value) < 0.05) # => all rejections based on 5%
Tests specifically for the normal distribution
The Shapiro-Wilk test of normality:
- \(H_0\): the distribution function which generated \(X\) is normal.
D’Agostino test for skewness:
Under the hypothesis of normality, the data should be symmetric (i.e. the skewness should be zero). The D'Agostino test uses this as its null hypothesis and is useful for detecting significant skewness in supposedly normal data.
\(H_0\): the data have zero skewness.
The Jarque-Bera test for normality:
Under the hypothesis of normality, the data should be symmetric (i.e. the skewness should be zero) and have kurtosis close to three. The Jarque-Bera statistic is asymptotically chi-square distributed with two degrees of freedom.
\(H_0\): the skewness and the excess kurtosis are both zero.
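Concretely, with sample skewness \(S\) and sample kurtosis \(K\), the statistic is \(\mathrm{JB} = \frac{n}{6}\left(S^2 + \frac{(K-3)^2}{4}\right)\). A small sketch of this computation (assuming X.N from above; jarque.test() should give the same value):
## Sketch: Jarque-Bera statistic "by hand" (assumes X.N from above)
x = as.numeric(X.N)
m = mean(x)
S = mean((x - m)^3) / mean((x - m)^2)^(3/2) # sample skewness
K = mean((x - m)^4) / mean((x - m)^2)^2 # sample kurtosis
(JB = length(x)/6 * (S^2 + (K - 3)^2/4)) # Jarque-Bera statistic
pchisq(JB, df = 2, lower.tail = FALSE) # asymptotic p-value; compare with jarque.test(X.N)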
Applied to normal data:
(sh = shapiro.test(X.N)) # Shapiro--Wilk
##
## Shapiro-Wilk normality test
##
## data: X.N
## W = 0.99811173, p-value = 0.3326257
(ag = agostino.test(X.N)) # D'Agostino's test
##
## D'Agostino skewness test
##
## data: X.N
## skew = -0.077540707, z = -1.006945762, p-value = 0.3139608
## alternative hypothesis: data have a skewness
(jb = jarque.test(X.N)) # Jarque--Bera test
##
## Jarque-Bera Normality Test
##
## data: X.N
## JB = 1.210698, p-value = 0.5458839
## alternative hypothesis: greater
stopifnot(min(sh$p.value, ag$p.value, jb$p.value) >= 0.05) # => no rejections
Applied to \(t\) data:
(sh = shapiro.test(X.t)) # Shapiro--Wilk
##
## Shapiro-Wilk normality test
##
## data: X.t
## W = 0.60189343, p-value < 2.2204e-16
(ag = agostino.test(X.t)) # D'Agostino's test
##
## D'Agostino skewness test
##
## data: X.t
## skew = -0.61633906, z = -7.41259243, p-value = 1.239009e-13
## alternative hypothesis: data have a skewness
(jb = jarque.test(X.t)) # Jarque--Bera test
##
## Jarque-Bera Normality Test
##
## data: X.t
## JB = 793183.43, p-value < 2.2204e-16
## alternative hypothesis: greater
stopifnot(max(sh$p.value, ag$p.value, jb$p.value) < 0.05) # => all rejections based on 5%
Graphical tests of normality
Note that Q-Q plots are more widely used than P-P plots (probability-probability plots), as they highlight deviations from the reference distribution in the tails more clearly.
Applied to normal data:
pp_plot(X.N, FUN = function(q) pnorm(q, mean = mu.N, sd = sig.N))
qq_plot(X.N, FUN = function(p) qnorm(p, mean = mu.N, sd = sig.N))
Applied to \(t\) data:
pp_plot(X.t, FUN = function(q) pnorm(q, mean = mu.t, sd = sig.t))
qq_plot(X.t, FUN = function(p) qnorm(p, mean = mu.t, sd = sig.t))
## => S-shape => Data seems to come from heavier-tailed distribution than
## the normal distribution.
3.4.3 Data application
Prepare risk factor data:
data(DJ_const) # constituents data
= c("KO", "MSFT", "INTC", "DIS") # margins considered
margin = DJ_const['1993-01-01/2000-12-31', margin]
DJ.const = -returns(DJ.const) # compute -log-returns; daily risk-factor changes
X = list(daily = X, # daily risk-factor changes
X. weekly = apply.weekly(X, FUN = colSums), # weekly risk-factor changes
monthly = apply.monthly(X, FUN = colSums), # monthly risk-factor changes
quarterly = apply.quarterly(X, FUN = colSums)) # quarterly risk-factor changes
3.4.3.1 Are the risk-factor changes normally distributed?
## Formal tests
pvals = sapply(X., function(x) apply(x, 2, function(data) jarque.test(data)$p.value))
pvals < 0.05
## daily weekly monthly quarterly
## KO TRUE TRUE TRUE TRUE
## MSFT TRUE TRUE TRUE FALSE
## INTC TRUE TRUE TRUE FALSE
## DIS TRUE TRUE FALSE FALSE
## => Daily and weekly risk-factor changes are not (univariate) normal
## Q-Q plots
= c("daily", "weekly", "monthly", "quarterly")
time for(j in seq_len(ncol(X))) { # for all margins, do...
for(k in seq_along(time)) { # for daily, weekly, monthly and quarterly data, do...
qq_plot(X.[[k]][,j], FUN = function(p)
qnorm(p, mean = mean(X.[[k]][,j]), sd = sd(X.[[k]][,j])),
main = paste("Q-Q plot for margin",margin[j],"based on",time[k],"data"))
mtext(paste("p-value:", round(pvals[j,k], 4)), side = 4, line = 1, adj = 0)
} }
3.4.4 Simulation
Under \(H_0\), p-values are uniformly distributed. Let’s check that for \(\mathcal{N}(0,1)\) data.
set.seed(271)
pvals.H0 = sapply(1:200, function(b) jarque.test(rnorm(n))$p.value)
qq_plot(pvals.H0, FUN = qunif)
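A complementary formal check (a sketch, not part of the original; the p-values are only asymptotically uniform) is a Kolmogorov-Smirnov test of uniformity:
## Sketch: formal test of uniformity of the p-values under H0
ks.test(pvals.H0, y = "punif")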
This does not hold under \(H_1\):
set.seed(271)
pvals.H1 = sapply(1:200, function(b) jarque.test(rt(n, df = nu))$p.value)
qq_plot(pvals.H1, FUN = qunif)
all(pvals.H1 == 0) # all p-values 0
## [1] TRUE
3.5 Multivariate stylized facts
Illustrates multivariate stylized facts for financial return series.
Multivariate stylized facts:
(M1) Return series show little cross-correlation, except for contemporaneous returns;
(M2) Absolute (or squared) returns show profound cross-correlation;
(M3) Correlations between contemporaneous returns vary over time;
(M4) Extreme returns often coincide with extreme returns in (several) other time series.
library(xts)
library(nvmix) # for rNorm(), rStudent()
library(qrmdata)
library(qrmtools)
3.5.1 Index data
data("FTSE")
data("SMI")
Build log-returns and merge:
FTSE.X = returns(FTSE)
SMI.X  = returns(SMI)
X = merge(FTSE = FTSE.X, SMI = SMI.X, all = FALSE)
3.5.2 (M1) and (M2)
(M1) Return series show little cross-correlation, except for contemporaneous returns
(M2) Absolute (or squared) returns show profound cross-correlation
Autocorrelations of returns and absolute returns:
acf(X) # raw values
acf(abs(X)) # absolute values
3.5.3 (M3)
- (M3) Correlations between contemporaneous returns vary over time
Plot cross-correlation (and volatilities):
X.cor  = apply.monthly(X, FUN = cor)[,2] # cross-correlations estimated based on monthly non-overlapping windows
X.vols = apply.monthly(X, FUN = function(x) apply(x, 2, sd)) # componentwise volatilities
X.cor.vols = merge(X.cor, X.vols)
names(X.cor.vols) = c("Cross-correlation", "Volatility FTSE", "Volatility SMI")
plot.zoo(X.cor.vols, xlab = "Time", main = "Cross-correlation and volatility estimates")
## => Cross-correlation varies over time; below we further investigate the
## relationship between volatility and cross-correlation
Extract volatilities and compute Fisher-transformed cross-correlations:
The Fisher transformation can be used to test hypotheses about the value of the population correlation coefficient \(\rho\) between variables \(X\) and \(Y\). This is because, when the transformation is applied to the sample correlation coefficient, the sampling distribution of the resulting variable is approximately normal, with a variance that is stable over different values of the underlying true correlation.
Fisher’s transformation of \(\rho\) is defined as \[ z = \frac{1}{2}\log\left(\frac{1+\rho}{1-\rho}\right) \]
The Fisher transform is a well-known variance-stabilizing transform that is appropriate when correlation is the dependent variable in a regression analysis.
= as.numeric(X.vols[,"FTSE"])
FTSE.sig = as.numeric(X.vols[,"SMI"])
SMI.sig = function(r) log((1 + r)/(1 - r))/2 # Fisher (z-)transformation
fisher = fisher(X.cor) # Fisher transform cross correlations rho
Plot Fisher-transformed cross-correlation against volatility with regression lines:
Note: If \((X,Y)\) are jointly normal with correlation \(\rho\), Fisher's transform of the sample correlation is approximately \(\mathcal{N}\left(\frac{1}{2}\log\left(\frac{1+\rho}{1-\rho}\right), \frac{1}{n-3}\right)\) distributed (\(n\) = sample size).
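For example (a sketch, not part of the original analysis), an approximate 95% confidence interval for the full-sample cross-correlation follows by transforming, adding \(\pm z_{0.975}/\sqrt{n-3}\), and transforming back with \(\tanh\) (the inverse of the Fisher transform):
## Sketch: approximate 95% CI for the full-sample cross-correlation via Fisher's z (assumes X, fisher from above)
r.hat = cor(X)[1, 2] # sample cross-correlation between FTSE and SMI log-returns
z = fisher(r.hat) # Fisher-transformed correlation
ci.z = z + c(-1, 1) * qnorm(0.975) / sqrt(nrow(X) - 3) # CI on the z-scale
tanh(ci.z) # back-transformed CI for the correlation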
## FTSE
plot(FTSE.sig, rho, xlab = "Estimated volatility", ylab = "Estimated cross-correlation")
reg = lm(rho ~ FTSE.sig)
summary(reg)
##
## Call:
## lm(formula = rho ~ FTSE.sig)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14689484 -0.23532132 0.00702075 0.27898787 0.88317510
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.58617194 0.04376356 13.39407 < 2.22e-16 ***
## FTSE.sig 36.37913535 3.89888949 9.33064 < 2.22e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3624633 on 300 degrees of freedom
## Multiple R-squared: 0.2249281, Adjusted R-squared: 0.2223445
## F-statistic: 87.06085 on 1 and 300 DF, p-value: < 2.2204e-16
abline(reg)
## SMI
plot(SMI.sig, rho, xlab = "Estimated volatility", ylab = "Estimated cross-correlation")
reg = lm(rho ~ SMI.sig)
summary(reg)
##
## Call:
## lm(formula = rho ~ SMI.sig)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.61462953 -0.25011470 0.00568859 0.25887690 0.91906510
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.63948644 0.04474835 14.29073 < 2.22e-16 ***
## SMI.sig 30.00745100 3.84716708 7.79988 1.0329e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3754027 on 300 degrees of freedom
## Multiple R-squared: 0.1686024, Adjusted R-squared: 0.165831
## F-statistic: 60.83817 on 1 and 300 DF, p-value: 1.03295e-13
abline(reg)
EXERCISE: Now try and compare with these SWN (strict white noise) data:
set.seed(271)
X.t = xts(rStudent(n = nrow(X), df = 3, scale = cor(X)), time(X))
X.N = xts(rNorm(n = nrow(X), scale = cor(X)), time(X))
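One possible way to carry out this comparison (a sketch under the assumption that the objects above are available; not part of the original): recompute the monthly cross-correlations and volatilities for the simulated data and plot them against each other. For strict white noise, no systematic relationship between volatility and cross-correlation should appear.
## Sketch: repeat the volatility/cross-correlation analysis for the simulated t data
X.t.cor  = apply.monthly(X.t, FUN = cor)[,2] # monthly cross-correlations
X.t.vols = apply.monthly(X.t, FUN = function(x) apply(x, 2, sd)) # monthly volatilities
plot(as.numeric(X.t.vols[,1]), as.numeric(fisher(X.t.cor)),
     xlab = "Estimated volatility (1st component)",
     ylab = "Fisher-transformed cross-correlation")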
3.5.4 (M4)
- (M4) Extreme returns often coincide with extreme returns in (several) other time series
plot.zoo(X, xlab = "Time", main = "Log-returns") # seems like extremes occur together
X. = apply(X, 2, rank) / (nrow(X) + 1)
plot(X., main = "Componentwise scaled ranks") # now better visible (bottom-left, top-right corners)
## Note: More on that ("pseudo-observations") in Chapter 7