Chapter 11 Exploratory analysis of time series data: Examples

11.1 Example: Monthly electricity sales for Virginia

11.1.1 Extract data from remote database

esales <- dbGetQuery(db,'SELECT * from eia_elec_sales_va_all_m') # SQL code to retrieve data from a table in the remote database
# str(esales)
esales <- as_tibble(esales) # Convert dataframe to a 'tibble' for tidyverse work
# str(esales)

# Reference: https://arrow.apache.org/docs/r/
# if(!('arrow' %in% installed.packages())) install.packages('arrow')
library(arrow)
write_feather(esales, "data/esales.feather")

# Close connection -- this is good practice
dbDisconnect(db)
dbUnloadDriver(db_driver)

11.1.2 Briefly characterize the dataset

library(arrow)
esales <- read_feather("data/esales.feather")

print(esales)    # print the data as a table

## # A tibble: 233 x 4
##     value date        year month
##     <dbl> <date>     <int> <int>
##  1  8282. 2020-05-01  2020     5
##  2  7839. 2020-04-01  2020     4
##  3  8889. 2020-03-01  2020     3
##  4  9368. 2020-02-01  2020     2
##  5  9209. 2020-01-01  2020     1
##  6 10038. 2019-12-01  2019    12
##  7  9291. 2019-11-01  2019    11
##  8  8757. 2019-10-01  2019    10
##  9  9874. 2019-09-01  2019     9
## 10 10912. 2019-08-01  2019     8
## # … with 223 more rows

summary(esales)  # compute basic summary statistics about the data

##      value            date                 year          month       
##  Min.   : 7153   Min.   :2001-01-01   Min.   :2001   Min.   : 1.000  
##  1st Qu.: 8200   1st Qu.:2005-11-01   1st Qu.:2005   1st Qu.: 3.000  
##  Median : 9019   Median :2010-09-01   Median :2010   Median : 6.000  
##  Mean   : 9093   Mean   :2010-08-31   Mean   :2010   Mean   : 6.425  
##  3rd Qu.: 9885   3rd Qu.:2015-07-01   3rd Qu.:2015   3rd Qu.: 9.000  
##  Max.   :11724   Max.   :2020-05-01   Max.   :2020   Max.   :12.000

boxplot(esales)

# References: https://www.tidyverse.org/, https://dplyr.tidyverse.org/
# filter(data object, condition) : syntax for filter() command

esales %>%
  filter(year == 2019) %>%
  filter(value > 9000) %>%
  print()

(esales %>%
  group_by(year) %>%
  summarise(Total = sum(value)) -> total_esales_by_year)

esales %>%
  mutate(sales_TWh = value/1000) %>%
  dplyr::select(-value)

11.1.3 Plot the time series

#Reference: https://ggplot2.tidyverse.org/

ggplot(data=esales, aes(x=date,y=value)) +
  geom_line() + xlab("Year") + ylab("Virginia monthly total electricity sales (GWh)")

# library(lubridate) # Make it easy to deal with dates

esales %>% filter(month==3)                   # These three lines of code

## # A tibble: 20 x 4
##    value date        year month
##    <dbl> <date>     <int> <int>
##  1 8889. 2020-03-01  2020     3
##  2 9466. 2019-03-01  2019     3
##  3 9666. 2018-03-01  2018     3
##  4 9372. 2017-03-01  2017     3
##  5 8406. 2016-03-01  2016     3
##  6 9435. 2015-03-01  2015     3
##  7 9676. 2014-03-01  2014     3
##  8 9506. 2013-03-01  2013     3
##  9 8086. 2012-03-01  2012     3
## 10 8688. 2011-03-01  2011     3
## 11 8568. 2010-03-01  2010     3
## 12 8926. 2009-03-01  2009     3
## 13 8512. 2008-03-01  2008     3
## 14 8632. 2007-03-01  2007     3
## 15 8519. 2006-03-01  2006     3
## 16 9125. 2005-03-01  2005     3
## 17 8136. 2004-03-01  2004     3
## 18 8108. 2003-03-01  2003     3
## 19 7675. 2002-03-01  2002     3
## 20 8070. 2001-03-01  2001     3

esales %>% filter(month(date)==3)             #   all do

## # A tibble: 20 x 4
##    value date        year month
##    <dbl> <date>     <int> <int>
##  1 8889. 2020-03-01  2020     3
##  2 9466. 2019-03-01  2019     3
##  3 9666. 2018-03-01  2018     3
##  4 9372. 2017-03-01  2017     3
##  5 8406. 2016-03-01  2016     3
##  6 9435. 2015-03-01  2015     3
##  7 9676. 2014-03-01  2014     3
##  8 9506. 2013-03-01  2013     3
##  9 8086. 2012-03-01  2012     3
## 10 8688. 2011-03-01  2011     3
## 11 8568. 2010-03-01  2010     3
## 12 8926. 2009-03-01  2009     3
## 13 8512. 2008-03-01  2008     3
## 14 8632. 2007-03-01  2007     3
## 15 8519. 2006-03-01  2006     3
## 16 9125. 2005-03-01  2005     3
## 17 8136. 2004-03-01  2004     3
## 18 8108. 2003-03-01  2003     3
## 19 7675. 2002-03-01  2002     3
## 20 8070. 2001-03-01  2001     3

esales %>% filter(lubridate::month(date)==3)  #   the same thing.

## # A tibble: 20 x 4
##    value date        year month
##    <dbl> <date>     <int> <int>
##  1 8889. 2020-03-01  2020     3
##  2 9466. 2019-03-01  2019     3
##  3 9666. 2018-03-01  2018     3
##  4 9372. 2017-03-01  2017     3
##  5 8406. 2016-03-01  2016     3
##  6 9435. 2015-03-01  2015     3
##  7 9676. 2014-03-01  2014     3
##  8 9506. 2013-03-01  2013     3
##  9 8086. 2012-03-01  2012     3
## 10 8688. 2011-03-01  2011     3
## 11 8568. 2010-03-01  2010     3
## 12 8926. 2009-03-01  2009     3
## 13 8512. 2008-03-01  2008     3
## 14 8632. 2007-03-01  2007     3
## 15 8519. 2006-03-01  2006     3
## 16 9125. 2005-03-01  2005     3
## 17 8136. 2004-03-01  2004     3
## 18 8108. 2003-03-01  2003     3
## 19 7675. 2002-03-01  2002     3
## 20 8070. 2001-03-01  2001     3

# We don't have to keep the 'year' and 'month' column: can recover them if needed

esales %>%
  dplyr::select(date, sales_GWh = value) -> esales_tbl

print(esales_tbl)

## # A tibble: 233 x 2
##    date       sales_GWh
##    <date>         <dbl>
##  1 2020-05-01     8282.
##  2 2020-04-01     7839.
##  3 2020-03-01     8889.
##  4 2020-02-01     9368.
##  5 2020-01-01     9209.
##  6 2019-12-01    10038.
##  7 2019-11-01     9291.
##  8 2019-10-01     8757.
##  9 2019-09-01     9874.
## 10 2019-08-01    10912.
## # … with 223 more rows

11.1.4 Convert the data frame into a time series `tsibble` object

# install.packages("tsibble")
library(tsibble) # Reference: https://tsibble.tidyverts.org/articles/intro-tsibble.html

esales_tbl %>% as_tsibble(index = date) -> elsales_tbl_ts

print(elsales_tbl_ts)

## # A tsibble: 233 x 2 [1D]
##    date       sales_GWh
##    <date>         <dbl>
##  1 2001-01-01     9576.
##  2 2001-02-01     7820.
##  3 2001-03-01     8070.
##  4 2001-04-01     7153.
##  5 2001-05-01     7224.
##  6 2001-06-01     8264.
##  7 2001-07-01     8896.
##  8 2001-08-01     9404.
##  9 2001-09-01     7753.
## 10 2001-10-01     7272.
## # … with 223 more rows

11.1.5 Make some plots

11.1.5.1 Make a histogram of monthly sales

hist(elsales_tbl_ts$sales_GWh, breaks=40)

11.1.5.2 Make a seasonal plot

# This plot won't work. Why not?
elsales_tbl_ts %>%
  feasts::gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")

# install.packages("feasts"), Reference: https://feasts.tidyverts.org/
library(feasts)

elsales_tbl_ts %>%
  mutate(Month = tsibble::yearmonth(date)) %>%
  as_tsibble(index = Month) %>%
  dplyr::select(Month,sales_GWh) -> vaelsales_tbl_ts

print(vaelsales_tbl_ts)

## # A tsibble: 233 x 2 [1M]
##       Month sales_GWh
##       <mth>     <dbl>
##  1 2001 Jan     9576.
##  2 2001 Feb     7820.
##  3 2001 Mar     8070.
##  4 2001 Apr     7153.
##  5 2001 May     7224.
##  6 2001 Jun     8264.
##  7 2001 Jul     8896.
##  8 2001 Aug     9404.
##  9 2001 Sep     7753.
## 10 2001 Oct     7272.
## # … with 223 more rows

autoplot(vaelsales_tbl_ts, sales_GWh) +
  ylab("Virginia monthly total electricity sales (GWh)") +
  xlab("")  # Leave horiz. axis label blank

### Seasonal plots and seasonal subseries plots

vaelsales_tbl_ts %>% gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")

vaelsales_tbl_ts %>%
  gg_subseries(sales_GWh) +
  labs(
    y = "Sales (GWh)",
    title = "Seasonal subseries plot: Virginia electricity sales"
  )

11.1.6 Scatterplots

Investigating relationships between two variables. Scatterplots. Correlation. Scatterplot matrices.

Readings: FPP Sect. 2.6

vic_elec

## # A tsibble: 52,608 x 5 [30m] <Australia/Melbourne>
##    Time                Demand Temperature Date       Holiday
##    <dttm>               <dbl>       <dbl> <date>     <lgl>  
##  1 2012-01-01 00:00:00  4383.        21.4 2012-01-01 TRUE   
##  2 2012-01-01 00:30:00  4263.        21.0 2012-01-01 TRUE   
##  3 2012-01-01 01:00:00  4049.        20.7 2012-01-01 TRUE   
##  4 2012-01-01 01:30:00  3878.        20.6 2012-01-01 TRUE   
##  5 2012-01-01 02:00:00  4036.        20.4 2012-01-01 TRUE   
##  6 2012-01-01 02:30:00  3866.        20.2 2012-01-01 TRUE   
##  7 2012-01-01 03:00:00  3694.        20.1 2012-01-01 TRUE   
##  8 2012-01-01 03:30:00  3562.        19.6 2012-01-01 TRUE   
##  9 2012-01-01 04:00:00  3433.        19.1 2012-01-01 TRUE   
## 10 2012-01-01 04:30:00  3359.        19.0 2012-01-01 TRUE   
## # … with 52,598 more rows

summary(vic_elec)

##       Time                         Demand      Temperature   
##  Min.   :2012-01-01 00:00:00   Min.   :2858   Min.   : 1.50  
##  1st Qu.:2012-09-30 22:52:30   1st Qu.:3969   1st Qu.:12.30  
##  Median :2013-07-01 22:45:00   Median :4635   Median :15.40  
##  Mean   :2013-07-01 22:45:00   Mean   :4665   Mean   :16.27  
##  3rd Qu.:2014-04-01 23:37:30   3rd Qu.:5244   3rd Qu.:19.40  
##  Max.   :2014-12-31 23:30:00   Max.   :9345   Max.   :43.20  
##       Date             Holiday       
##  Min.   :2012-01-01   Mode :logical  
##  1st Qu.:2012-09-30   FALSE:51120    
##  Median :2013-07-01   TRUE :1488     
##  Mean   :2013-07-01                  
##  3rd Qu.:2014-04-01                  
##  Max.   :2014-12-31

vic_elec %>%
  filter(year(Time) == 2013) %>%
  autoplot(Demand) +
  labs(
    y = "Demand (GW)",
    title = "Half-hourly electricity demand: Victoria"
  )

vic_elec %>%
  filter(year(Time) == 2013) %>%
  autoplot(Temperature) +
  labs(
    y = "Temperature (degrees Celsius)",
    title = "Half-hourly temperatures: Melbourne, Australia"
  )

vic_elec %>%
  filter(year(Time) == 2013) %>%
  ggplot(aes(x = Temperature, y = Demand)) +
#  geom_density2d() +
  geom_point(size=0.1, aes(colour=Holiday), alpha = 0.4) +
  labs(y = "Demand (GW)", x = "Temperature (degrees Celsius)")

A Scatterplot matrix

vic_elec

## # A tsibble: 52,608 x 5 [30m] <Australia/Melbourne>
##    Time                Demand Temperature Date       Holiday
##    <dttm>               <dbl>       <dbl> <date>     <lgl>  
##  1 2012-01-01 00:00:00  4383.        21.4 2012-01-01 TRUE   
##  2 2012-01-01 00:30:00  4263.        21.0 2012-01-01 TRUE   
##  3 2012-01-01 01:00:00  4049.        20.7 2012-01-01 TRUE   
##  4 2012-01-01 01:30:00  3878.        20.6 2012-01-01 TRUE   
##  5 2012-01-01 02:00:00  4036.        20.4 2012-01-01 TRUE   
##  6 2012-01-01 02:30:00  3866.        20.2 2012-01-01 TRUE   
##  7 2012-01-01 03:00:00  3694.        20.1 2012-01-01 TRUE   
##  8 2012-01-01 03:30:00  3562.        19.6 2012-01-01 TRUE   
##  9 2012-01-01 04:00:00  3433.        19.1 2012-01-01 TRUE   
## 10 2012-01-01 04:30:00  3359.        19.0 2012-01-01 TRUE   
## # … with 52,598 more rows

boxplot(vic_elec$Temperature)

# install.packages("GGally")
vic_elec %>%
  # mutate(Temperature = round(Temperature)) %>%
  # pivot_wider(values_from=c(Demand,Temperature), names_from=Holiday) %>%
  GGally::ggpairs(columns = 3:2)

vic_elec %>%
  mutate(Year = factor(year(Date))) %>%
  dplyr::select(-c(Date, Holiday)) %>%
  GGally::ggpairs(columns = 4:2)

11.2 Example: Australian production

# install.packages('tsibbledata')
library(tsibbledata)

aus_production

## # A tsibble: 218 x 7 [1Q]
##    Quarter  Beer Tobacco Bricks Cement Electricity   Gas
##      <qtr> <dbl>   <dbl>  <dbl>  <dbl>       <dbl> <dbl>
##  1 1956 Q1   284    5225    189    465        3923     5
##  2 1956 Q2   213    5178    204    532        4436     6
##  3 1956 Q3   227    5297    208    561        4806     7
##  4 1956 Q4   308    5681    197    570        4418     6
##  5 1957 Q1   262    5577    187    529        4339     5
##  6 1957 Q2   228    5651    214    604        4811     7
##  7 1957 Q3   236    5317    227    603        5259     7
##  8 1957 Q4   320    6152    222    582        4735     6
##  9 1958 Q1   272    5758    199    554        4608     5
## 10 1958 Q2   233    5641    229    620        5196     7
## # … with 208 more rows

aus_production %>% gg_season(Electricity)

aus_production %>% gg_season(Beer)

11.3 Example: Gross Domestic Product data

11.3.1 Exploratory data analysis

library(tsibbledata) # Data sets package

print(global_economy)

## # A tsibble: 15,150 x 9 [1Y]
## # Key:       Country [263]
##    Country     Code   Year         GDP Growth   CPI Imports Exports Population
##    <fct>       <fct> <dbl>       <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
##  1 Afghanistan AFG    1960  537777811.     NA    NA    7.02    4.13    8996351
##  2 Afghanistan AFG    1961  548888896.     NA    NA    8.10    4.45    9166764
##  3 Afghanistan AFG    1962  546666678.     NA    NA    9.35    4.88    9345868
##  4 Afghanistan AFG    1963  751111191.     NA    NA   16.9     9.17    9533954
##  5 Afghanistan AFG    1964  800000044.     NA    NA   18.1     8.89    9731361
##  6 Afghanistan AFG    1965 1006666638.     NA    NA   21.4    11.3     9938414
##  7 Afghanistan AFG    1966 1399999967.     NA    NA   18.6     8.57   10152331
##  8 Afghanistan AFG    1967 1673333418.     NA    NA   14.2     6.77   10372630
##  9 Afghanistan AFG    1968 1373333367.     NA    NA   15.2     8.90   10604346
## 10 Afghanistan AFG    1969 1408888922.     NA    NA   15.0    10.1    10854428
## # … with 15,140 more rows

global_economy %>% filter(Country=="Sweden") %>% print()

## # A tsibble: 58 x 9 [1Y]
## # Key:       Country [1]
##    Country Code   Year          GDP Growth   CPI Imports Exports Population
##    <fct>   <fct> <dbl>        <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
##  1 Sweden  SWE    1960 14842870293.  NA     9.21    23.4    23.0    7484656
##  2 Sweden  SWE    1961 16147160123.   5.68  9.41    21.7    22.3    7519998
##  3 Sweden  SWE    1962 17511477311.   4.26  9.86    21.4    21.9    7561588
##  4 Sweden  SWE    1963 18954132366.   5.33 10.1     21.5    21.9    7604328
##  5 Sweden  SWE    1964 21137242561.   6.82 10.5     21.9    22.3    7661354
##  6 Sweden  SWE    1965 23260320646.   3.82 11.0     22.5    21.9    7733853
##  7 Sweden  SWE    1966 25302033132.   2.09 11.7     21.9    21.4    7807797
##  8 Sweden  SWE    1967 27463409202.   3.37 12.2     21.0    21.1    7867931
##  9 Sweden  SWE    1968 29143383491.   3.64 12.5     21.6    21.6    7912273
## 10 Sweden  SWE    1969 31649203886.   5.01 12.8     23.0    22.8    7968072
## # … with 48 more rows

global_economy %>%
  filter(Country=="Sweden") %>%
  autoplot(GDP) +
  ggtitle("GDP for Sweden") + ylab("$US billions")

11.3.2 Fitting data to simple models

global_economy %>% model(trend_model = TSLM(GDP ~ trend())) -> fit

fit

## # A mable: 263 x 2
## # Key:     Country [263]
##    Country             trend_model
##    <fct>                   <model>
##  1 Afghanistan              <TSLM>
##  2 Albania                  <TSLM>
##  3 Algeria                  <TSLM>
##  4 American Samoa           <TSLM>
##  5 Andorra                  <TSLM>
##  6 Angola                   <TSLM>
##  7 Antigua and Barbuda      <TSLM>
##  8 Arab World               <TSLM>
##  9 Argentina                <TSLM>
## 10 Armenia                  <TSLM>
## # … with 253 more rows

fit %>% filter(Country == "Sweden") %>% residuals()

## # A tsibble: 58 x 4 [1Y]
## # Key:       Country, .model [1]
##    Country .model       Year       .resid
##    <fct>   <chr>       <dbl>        <dbl>
##  1 Sweden  trend_model  1960 79973991821.
##  2 Sweden  trend_model  1961 71110300270.
##  3 Sweden  trend_model  1962 62306636078.
##  4 Sweden  trend_model  1963 53581309752.
##  5 Sweden  trend_model  1964 45596438566.
##  6 Sweden  trend_model  1965 37551535271.
##  7 Sweden  trend_model  1966 29425266377.
##  8 Sweden  trend_model  1967 21418661066.
##  9 Sweden  trend_model  1968 12930653974.
## 10 Sweden  trend_model  1969  5268492989.
## # … with 48 more rows

fit %>% filter(Country == "Sweden") %>% residuals() %>% autoplot(.resid)

11.3.3 Work with ln(GDP)

global_economy %>%
  filter(Country=="Sweden") %>%
  autoplot(log(GDP)) +
  ggtitle("ln(GDP) for Sweden") + ylab("$US billions")

global_economy %>%
  model(trend_model = TSLM(log(GDP) ~ trend())) -> logfit

logfit %>% filter(Country == "Sweden") %>% residuals() %>% autoplot()

global_economy %>% model(trend_model = TSLM(log(GDP) ~ log(Population))) -> fit3

fit3 %>% filter(Country == "Sweden") %>% residuals() %>% autoplot()

11.4 Producing forecasts

fit %>% forecast(h = "3 years") -> fcast3yrs

fcast3yrs

## # A fable: 789 x 5 [1Y]
## # Key:     Country, .model [263]
##    Country        .model       Year                 GDP         .mean
##    <fct>          <chr>       <dbl>              <dist>         <dbl>
##  1 Afghanistan    trend_model  2018 N(1.6e+10, 1.3e+19)  16205101654.
##  2 Afghanistan    trend_model  2019 N(1.7e+10, 1.3e+19)  16511878141.
##  3 Afghanistan    trend_model  2020 N(1.7e+10, 1.3e+19)  16818654627.
##  4 Albania        trend_model  2018 N(1.4e+10, 3.9e+18)  13733734164.
##  5 Albania        trend_model  2019 N(1.4e+10, 3.9e+18)  14166852711.
##  6 Albania        trend_model  2020 N(1.5e+10, 3.9e+18)  14599971258.
##  7 Algeria        trend_model  2018 N(1.6e+11, 9.4e+20) 157895153441.
##  8 Algeria        trend_model  2019 N(1.6e+11, 9.4e+20) 161100952126.
##  9 Algeria        trend_model  2020 N(1.6e+11, 9.4e+20) 164306750811.
## 10 American Samoa trend_model  2018 N(6.8e+08, 1.7e+15)    682475000 
## # … with 779 more rows

fcast3yrs %>% filter(Country == "Sweden", Year == 2020) %>% str()

## fable [1 × 5] (S3: fbl_ts/tbl_ts/tbl_df/tbl/data.frame)
##  $ Country: Factor w/ 263 levels "Afghanistan",..: 232
##  $ .model : chr "trend_model"
##  $ Year   : num 2020
##  $ GDP    : dist [1:1] 
##   ..$ 3:List of 2
##   .. ..$ mu   : num 5.45e+11
##   .. ..$ sigma: num 5.34e+10
##   .. ..- attr(*, "class")= chr [1:2] "dist_normal" "dist_default"
##   ..@ vars: chr "GDP"
##  $ .mean  : num 5.45e+11
##  - attr(*, "key")= tibble [1 × 3] (S3: tbl_df/tbl/data.frame)
##   ..$ Country: Factor w/ 263 levels "Afghanistan",..: 232
##   ..$ .model : chr "trend_model"
##   ..$ .rows  : list<int> [1:1] 
##   .. ..$ : int 1
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
##  - attr(*, "index")= chr "Year"
##   ..- attr(*, "ordered")= logi TRUE
##  - attr(*, "index2")= chr "Year"
##  - attr(*, "interval")= interval [1:1] 1Y
##   ..@ .regular: logi TRUE
##  - attr(*, "response")= chr "GDP"
##  - attr(*, "dist")= chr "GDP"
##  - attr(*, "model_cn")= chr ".model"

fcast3yrs %>%
  filter(Country=="Sweden") %>%
  autoplot(global_economy) +
  ggtitle("GDP for Sweden") + ylab("$US billions")

11.4.1 Model residuals vs. forecast errors

Model residuals:

Your data: $y_1, y_2, \ldots, y_T$

Fitted values: $\hat{y}_1, \hat{y}_2, \ldots, \hat{y}_T$

Model residuals: $e_t = y_t - \hat{y}_t$

Forecast errors:

augment(fit)

## # A tsibble: 15,150 x 7 [1Y]
## # Key:       Country, .model [263]
##    Country     .model       Year         GDP      .fitted      .resid     .innov
##    <fct>       <chr>       <dbl>       <dbl>        <dbl>       <dbl>      <dbl>
##  1 Afghanistan trend_model  1960  537777811. -1587934559. 2125712370.     2.13e9
##  2 Afghanistan trend_model  1961  548888896. -1281158073. 1830046968.     1.83e9
##  3 Afghanistan trend_model  1962  546666678.  -974381586. 1521048264.     1.52e9
##  4 Afghanistan trend_model  1963  751111191.  -667605100. 1418716291.     1.42e9
##  5 Afghanistan trend_model  1964  800000044.  -360828613. 1160828658.     1.16e9
##  6 Afghanistan trend_model  1965 1006666638.   -54052127. 1060718765.     1.06e9
##  7 Afghanistan trend_model  1966 1399999967.   252724359. 1147275607.     1.15e9
##  8 Afghanistan trend_model  1967 1673333418.   559500846. 1113832572.     1.11e9
##  9 Afghanistan trend_model  1968 1373333367.   866277332.  507056034.     5.07e8
## 10 Afghanistan trend_model  1969 1408888922.  1173053819.  235835103.     2.36e8
## # … with 15,140 more rows

augment(fit) %>% filter(Country == "Sweden") %>%
  ggplot(aes(x = .resid)) +
  geom_histogram(bins = 20) +
  ggtitle("Histogram of residuals")

11.4.2 Are the model residuals auto-correlated?

augment(fit) %>% filter(Country == "Sweden") -> augSweden

augSweden %>%
  ACF(.resid) %>%
  autoplot() + ggtitle("ACF of residuals")

augment(fit3) %>% filter(Country == "Sweden") -> augSweden3

augSweden3 %>%
  ACF(.resid) %>%
  autoplot() + ggtitle("ACF of residuals")

11.5 Example: GDP, several countries

library(tsibbledata) # Data sets package

nordic <- c("Sweden", "Denmark", "Norway", "Finland")

(global_economy %>% filter(Country %in% nordic) -> nordic_economy)

## # A tsibble: 232 x 9 [1Y]
## # Key:       Country [4]
##    Country Code   Year          GDP Growth   CPI Imports Exports Population
##    <fct>   <fct> <dbl>        <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
##  1 Denmark DNK    1960  6248946880. NA      8.25    34.3    32.3    4579603
##  2 Denmark DNK    1961  6933842099.  6.38   8.53    32.3    30.0    4611687
##  3 Denmark DNK    1962  7812968114.  5.67   9.16    32.5    28.6    4647727
##  4 Denmark DNK    1963  8316692386.  0.637  9.72    30.8    30.4    4684483
##  5 Denmark DNK    1964  9506678763.  9.27  10.0     32.6    29.9    4722072
##  6 Denmark DNK    1965 10678897387.  4.56  10.6     31.5    29.3    4759012
##  7 Denmark DNK    1966 11721248101.  2.74  11.3     30.8    28.6    4797381
##  8 Denmark DNK    1967 12788479692.  3.42  12.2     30.0    27.3    4835354
##  9 Denmark DNK    1968 13196541952   3.97  13.2     29.7    27.7    4864883
## 10 Denmark DNK    1969 15009384585.  6.32  13.7     30.4    27.6    4891860
## # … with 222 more rows

nordic_economy %>% autoplot(GDP)

fitnord <- nordic_economy %>%
  model(
    trend_model = TSLM(GDP ~ trend()),
    trend_model_ln = TSLM(log(GDP) ~ trend()),
    ets = ETS(GDP ~ trend("A")),
    arima = ARIMA(GDP)
  )

fitnord

## # A mable: 4 x 5
## # Key:     Country [4]
##   Country trend_model trend_model_ln          ets          arima
##   <fct>       <model>        <model>      <model>        <model>
## 1 Denmark      <TSLM>         <TSLM> <ETS(M,A,N)> <ARIMA(1,1,1)>
## 2 Finland      <TSLM>         <TSLM> <ETS(M,A,N)> <ARIMA(0,1,2)>
## 3 Norway       <TSLM>         <TSLM> <ETS(M,A,N)> <ARIMA(0,1,1)>
## 4 Sweden       <TSLM>         <TSLM> <ETS(M,A,N)> <ARIMA(0,1,2)>

fitnord %>%
  dplyr::select(arima) %>%
  coef()

## # A tibble: 7 x 7
##   Country .model term  estimate std.error statistic    p.value
##   <fct>   <chr>  <chr>    <dbl>     <dbl>     <dbl>      <dbl>
## 1 Denmark arima  ar1     -0.390     0.206     -1.89 0.0636    
## 2 Denmark arima  ma1      0.724     0.143      5.05 0.00000484
## 3 Finland arima  ma1      0.406     0.120      3.39 0.00126   
## 4 Finland arima  ma2     -0.221     0.108     -2.05 0.0450    
## 5 Norway  arima  ma1      0.410     0.155      2.65 0.0104    
## 6 Sweden  arima  ma1      0.241     0.121      1.99 0.0510    
## 7 Sweden  arima  ma2     -0.188     0.101     -1.87 0.0670

Denmark: ARMA(1,1)

Finland: MA(2)

Norway: MA(1)

Sweden: MA(2)

nordic_economy %>%
  model(arima_constrained = ARIMA(GDP ~ pdq(1,0,2))) %>% dplyr::select(arima_constrained) %>% coef()

## # A tibble: 0 x 4
## # … with 4 variables: Country <fct>, .model <chr>, term <chr>, estimate <dbl>

fitnord %>% coef()

## # A tibble: 39 x 7
##    Country .model         term         estimate std.error statistic   p.value
##    <fct>   <chr>          <chr>           <dbl>     <dbl>     <dbl>     <dbl>
##  1 Denmark trend_model    (Intercept) -5.65e+10   8.75e+9     -6.46  2.70e- 8
##  2 Denmark trend_model    trend()      6.63e+ 9   2.58e+8     25.7   1.14e-32
##  3 Denmark trend_model_ln (Intercept)  2.30e+ 1   8.55e-2    269.    7.68e-89
##  4 Denmark trend_model_ln trend()      7.12e- 2   2.52e-3     28.3   7.68e-35
##  5 Denmark ets            alpha        1.00e+ 0  NA           NA    NA       
##  6 Denmark ets            beta         3.67e- 1  NA           NA    NA       
##  7 Denmark ets            l            4.92e+ 9  NA           NA    NA       
##  8 Denmark ets            b            1.24e+ 9  NA           NA    NA       
##  9 Denmark arima          ar1         -3.90e- 1   2.06e-1     -1.89  6.36e- 2
## 10 Denmark arima          ma1          7.24e- 1   1.43e-1      5.05  4.84e- 6
## # … with 29 more rows

fitnord %>%  glance()

## # A tibble: 16 x 21
##    Country .model r_squared adj_r_squared   sigma2 statistic   p_value    df
##    <fct>   <chr>      <dbl>         <dbl>    <dbl>     <dbl>     <dbl> <int>
##  1 Denmark trend…     0.922         0.920 1.08e+21      660.  1.14e-32     2
##  2 Denmark trend…     0.935         0.933 1.03e- 1      800.  7.68e-35     2
##  3 Denmark ets       NA            NA     1.04e- 2       NA  NA           NA
##  4 Denmark arima     NA            NA     2.41e+20       NA  NA           NA
##  5 Finland trend…     0.914         0.912 7.34e+20      594.  1.70e-31     2
##  6 Finland trend…     0.930         0.929 1.14e- 1      745.  4.96e-34     2
##  7 Finland ets       NA            NA     1.32e- 2       NA  NA           NA
##  8 Finland arima     NA            NA     1.89e+20       NA  NA           NA
##  9 Norway  trend…     0.824         0.821 4.60e+21      262.  8.54e-23     2
## 10 Norway  trend…     0.959         0.958 8.37e- 2     1307.  1.64e-40     2
## 11 Norway  ets       NA            NA     8.23e- 3       NA  NA           NA
## 12 Norway  arima     NA            NA     6.78e+20       NA  NA           NA
## 13 Sweden  trend…     0.919         0.918 2.65e+21      635.  3.07e-32     2
## 14 Sweden  trend…     0.935         0.933 8.19e- 2      800.  7.57e-35     2
## 15 Sweden  ets       NA            NA     1.16e- 2       NA  NA           NA
## 16 Sweden  arima     NA            NA     8.84e+20       NA  NA           NA
## # … with 13 more variables: log_lik <dbl>, AIC <dbl>, AICc <dbl>, BIC <dbl>,
## #   CV <dbl>, deviance <dbl>, df.residual <int>, rank <int>, MSE <dbl>,
## #   AMSE <dbl>, MAE <dbl>, ar_roots <list>, ma_roots <list>

fitnord %>% filter(Country == "Denmark") %>% dplyr::select(arima) %>% report()

## Series: GDP 
## Model: ARIMA(1,1,1) 
## 
## Coefficients:
##           ar1     ma1
##       -0.3898  0.7240
## s.e.   0.2061  0.1434
## 
## sigma^2 estimated as 2.407e+20:  log likelihood=-1417.5
## AIC=2840.99   AICc=2841.45   BIC=2847.12

fitnord %>%
  accuracy() %>%
  arrange(Country, MPE)

## # A tibble: 16 x 11
##    Country .model .type        ME    RMSE     MAE     MPE   MAPE  MASE RMSSE
##    <fct>   <chr>  <chr>     <dbl>   <dbl>   <dbl>   <dbl>  <dbl> <dbl> <dbl>
##  1 Denmark trend… Trai… -1.12e+10 6.89e10 3.67e10  -5.17   28.0  3.34  4.24 
##  2 Denmark ets    Trai…  4.50e+ 7 1.65e10 1.04e10   0.518   7.09 0.946 1.02 
##  3 Denmark arima  Trai…  4.40e+ 9 1.51e10 1.04e10   5.05    8.16 0.945 0.930
##  4 Denmark trend… Trai… -2.20e- 6 3.23e10 2.63e10  51.1    80.8  2.40  1.99 
##  5 Finland trend… Trai… -8.61e+ 9 5.64e10 2.99e10  -5.53   28.6  2.95  3.82 
##  6 Finland ets    Trai…  1.36e+ 8 1.47e10 9.41e 9   0.795   8.36 0.927 0.996
##  7 Finland arima  Trai…  3.54e+ 9 1.34e10 9.14e 9   5.03    8.92 0.900 0.906
##  8 Finland trend… Trai…  2.33e- 6 2.66e10 2.21e10  46.1    80.5  2.18  1.80 
##  9 Norway  trend… Trai… -1.31e+10 8.20e10 3.51e10  -4.24   24.9  2.24  3.01 
## 10 Norway  ets    Trai… -5.29e+ 8 2.75e10 1.37e10   0.755   6.94 0.870 1.01 
## 11 Norway  arima  Trai…  4.90e+ 9 2.56e10 1.40e10   5.04    8.11 0.890 0.938
## 12 Norway  trend… Trai… -1.10e- 5 6.67e10 5.48e10 130.    181.   3.49  2.45 
## 13 Sweden  trend… Trai… -1.18e+10 8.23e10 4.79e10  -3.96   23.7  2.25  2.68 
## 14 Sweden  ets    Trai…  1.19e+ 9 3.02e10 1.86e10   0.745   7.64 0.875 0.984
## 15 Sweden  arima  Trai…  8.48e+ 9 2.89e10 2.01e10   5.18    9.37 0.942 0.944
## 16 Sweden  trend… Trai…  3.22e- 6 5.05e10 3.90e10  29.4    53.3  1.83  1.65 
## # … with 1 more variable: ACF1 <dbl>

# ETS forecasts
USAccDeaths %>%
  ets() %>%
  forecast() %>%
  autoplot()

str(taylor)
plot(taylor)

11.5.1 Plot lagged values

vaelsales_tbl_ts  %>% filter(month(Month) %in% c(3,6,9,12)) %>% gg_lag(sales_GWh, lags = 1:2)

vaelsales_tbl_ts  %>% filter(month(Month) == 1) %>% gg_lag(sales_GWh, lags = 1:2)

vaelsales_tbl_ts %>% ACF(sales_GWh) %>% autoplot()

# decompose(vaelsales_tbl_ts)

vaelsales_tbl_ts %>%
  model(STL(sales_GWh ~ trend(window=21) + season(window='periodic'), robust = TRUE)) %>%
  components() %>%
  autoplot()

vaelsales_tbl_ts %>%
  mutate(ln_sales_GWh = log(sales_GWh)) %>%
  model(STL(ln_sales_GWh ~ trend(window=21) + season(window='periodic'),
    robust = TRUE)) %>%
  components() %>%
  autoplot()

vaelsales_tbl_ts %>%
  features(sales_GWh, feat_stl)

## # A tibble: 1 x 9
##   trend_strength seasonal_streng… seasonal_peak_y… seasonal_trough… spikiness
##            <dbl>            <dbl>            <dbl>            <dbl>     <dbl>
## 1          0.597            0.863                7                4   862916.
## # … with 4 more variables: linearity <dbl>, curvature <dbl>, stl_e_acf1 <dbl>,
## #   stl_e_acf10 <dbl>

vaelsales_tbl_ts %>%
  features(sales_GWh, feature_set(pkgs="feasts"))

## # A tibble: 1 x 48
##   trend_strength seasonal_streng… seasonal_peak_y… seasonal_trough… spikiness
##            <dbl>            <dbl>            <dbl>            <dbl>     <dbl>
## 1          0.597            0.863                7                4   862916.
## # … with 43 more variables: linearity <dbl>, curvature <dbl>, stl_e_acf1 <dbl>,
## #   stl_e_acf10 <dbl>, acf1 <dbl>, acf10 <dbl>, diff1_acf1 <dbl>,
## #   diff1_acf10 <dbl>, diff2_acf1 <dbl>, diff2_acf10 <dbl>, season_acf1 <dbl>,
## #   pacf5 <dbl>, diff1_pacf5 <dbl>, diff2_pacf5 <dbl>, season_pacf <dbl>,
## #   zero_run_mean <dbl>, nonzero_squared_cv <dbl>, zero_start_prop <dbl>,
## #   zero_end_prop <dbl>, lambda_guerrero <dbl>, kpss_stat <dbl>,
## #   kpss_pvalue <dbl>, pp_stat <dbl>, pp_pvalue <dbl>, ndiffs <int>,
## #   nsdiffs <int>, bp_stat <dbl>, bp_pvalue <dbl>, lb_stat <dbl>,
## #   lb_pvalue <dbl>, var_tiled_var <dbl>, var_tiled_mean <dbl>,
## #   shift_level_max <dbl>, shift_level_index <dbl>, shift_var_max <dbl>,
## #   shift_var_index <dbl>, shift_kl_max <dbl>, shift_kl_index <dbl>,
## #   spectral_entropy <dbl>, n_crossing_points <int>, longest_flat_spot <int>,
## #   coef_hurst <dbl>, stat_arch_lm <dbl>