7 Data Exploration

Loading packages

library(readr)
library(dplyr)
library(tidyr)
library(openair) # http://davidcarslaw.github.io/openair/
library(purrr)
library(lubridate)
library(ggplot2)
library(stringr)
library(knitr)
library(xts)
library(zoo)
library(gridExtra)
library(astsa)
library(rvest)
library(fpp2)
library(ranger)
library(broom)
library(RcppRoll)
library(reshape2)

Data loading

air_data_2 <- readRDS("data_rds/air_data_2.rds")

7.2 PM10 Constitucion Station

We create the dataset pm10 with PM10 values from the Constitución Station and we execute a summary

pm10 <- air_data_2 %>% filter(station == '1') %>%
                        select(date_time_utc, PM10)

summary(pm10)
##  date_time_utc                      PM10       
##  Min.   :2000-01-01 00:00:00   Min.   :  0.00  
##  1st Qu.:2004-06-30 23:15:00   1st Qu.: 19.00  
##  Median :2009-01-02 00:30:00   Median : 29.00  
##  Mean   :2008-12-31 18:50:45   Mean   : 34.39  
##  3rd Qu.:2013-07-02 23:45:00   3rd Qu.: 44.00  
##  Max.   :2017-12-31 23:00:00   Max.   :888.00  
##                                NA's   :3106

25% of the values are between 44.00 and 888.00. 888.00 is a value really extreme. How many extreme values (outliers) do we have in this series? We plot all the values to visualise this:

ggplot(pm10, aes(x = date_time_utc, y = PM10)) + 
         geom_point(alpha = 0.1)

We have very few values greater than 250. So, it doesn’t seem we have a problem with the outliers.

We filter out the most extreme values to improve the visualization, but

pm10_no_outliers <- pm10 %>% filter(PM10 <= 300)
ggplot(pm10_no_outliers, aes(x = date_time_utc, y = PM10)) + 
         geom_point(alpha = 0.1)

ggplot(pm10, aes(x = date_time_utc, y = PM10)) + 
         geom_point(alpha = 0.1, size = 0.2) +
          scale_y_continuous(trans='log2')

PM10_2017 <- pm10 %>% filter(year(date_time_utc) == '2017') 
ggplot(PM10_2017, aes(x = date_time_utc, y = PM10)) + 
         geom_point(alpha = 0.1, size = 1) +
          scale_y_continuous(trans='log2')

Daily averages

We create a new dataset with the PM10 daily averages and we plot them in a new graphic. We add a trend line too. There is a clear downward trend in the measurements and we have many fewer extreme values during the last decade. It seems like we have two very clear “epochs” in the data, with a clear change in the data structure starting somewhere around 2009.

pm10_day_avg <- pm10 %>% group_by(day = date(date_time_utc)) %>%
                          summarise(day_avg = mean(PM10, na.rm = TRUE))

ggplot(pm10_day_avg, aes(x = day, y = day_avg, , colour = day_avg)) + 
         geom_point(alpha = 0.5) +
         geom_smooth(color = "grey", alpha = 0.2) +
         scale_colour_gradientn(colours = terrain.colors(10)) +
         theme(legend.position = c(0.3, 0.9),
                legend.background = element_rect(colour = "transparent", fill = NA), legend.direction = "horizontal") +
         labs(colour = "PM10 daily average (colour scale)", x = "Year", y = "PM10 daily average", title = "PM10 daily average - 2000-2017 evolution (Constitucion station)")

We plot the same data but through a box plot graph. Now we can see very clearly how since 2009 the behaviour of the data is sustantially different from the precedent years. The PM10 day levels are smaller, the variability is significantly less pronounced and the trend is smoother too.

pm10_day_avg <- pm10 %>% group_by(day = date(date_time_utc)) %>%
                          summarise(day_avg = mean(PM10, na.rm = TRUE))

ggplot(pm10_day_avg, aes(y = day_avg, x = as.factor(year(day)), colour = day_avg)) + 
         geom_boxplot(alpha = 0.5)

We identify a very clear trend through the years on the last graph. But, as we already saw before on the grid graphs there are other things happening at the same time.

year_const <- year_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'PM10')
plot1 <- ggplot(year_const, aes(x = year, y = value)) + 
  geom_line()

month_const <- month_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'PM10')
plot2 <- ggplot(month_const, aes(x = month, y = value)) + 
  geom_line()

week_day_const <- week_day_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'PM10')
plot3 <- ggplot(week_day_const, aes(x = week_day, y = value)) + 
  geom_line()

hour_const <- hour_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'PM10')
plot4 <- ggplot(hour_const, aes(x = hour, y = value)) + 
  geom_line()

grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

7.3 NO2 Constitucion Station

We create the dataset pm10 with PM10 values from the Constitución Station and we execute a summary

NO2 <- air_data_2 %>% filter(station == '1') %>%
                        select(date_time_utc, NO2)

summary(NO2)
##  date_time_utc                      NO2        
##  Min.   :2000-01-01 00:00:00   Min.   :  0.00  
##  1st Qu.:2004-06-30 23:15:00   1st Qu.: 20.00  
##  Median :2009-01-02 00:30:00   Median : 34.00  
##  Mean   :2008-12-31 18:50:45   Mean   : 37.26  
##  3rd Qu.:2013-07-02 23:45:00   3rd Qu.: 51.00  
##  Max.   :2017-12-31 23:00:00   Max.   :408.00  
##                                NA's   :2930
NO2_day_avg <- NO2 %>% group_by(day = date(date_time_utc)) %>%
                          summarise(day_avg = mean(NO2, na.rm = TRUE))

ggplot(NO2_day_avg, aes(x = day, y = day_avg, , colour = day_avg)) + 
         geom_point(alpha = 0.5) +
         geom_smooth(color = "grey", alpha = 0.2) +
         scale_colour_gradientn(colours = terrain.colors(10)) +
         theme(legend.position = c(0.3, 0.9),
                legend.background = element_rect(colour = "transparent", fill = NA), legend.direction = "horizontal") +
         labs(colour = "PM10 daily average (colour scale)", x = "Year", y = "PM10 daily average", title = "NO2 daily average - 2000-2017 evolution (Constitucion station)")

significantly less pronounced and the trend is smoother too.

NO2_day_avg <- NO2 %>% group_by(day = date(date_time_utc)) %>%
                          summarise(day_avg = mean(NO2, na.rm = TRUE))

ggplot(NO2_day_avg, aes(y = day_avg, x = as.factor(year(day)), colour = day_avg)) + 
         geom_boxplot(alpha = 0.5)

year_const <- year_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'NO2')
plot1 <- ggplot(year_const, aes(x = year, y = value)) + 
  geom_line()

month_const <- month_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'NO2')
plot2 <- ggplot(month_const, aes(x = month, y = value)) + 
  geom_line()

week_day_const <- week_day_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'NO2')
plot3 <- ggplot(week_day_const, aes(x = week_day, y = value)) + 
  geom_line()

hour_const <- hour_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'NO2')
plot4 <- ggplot(hour_const, aes(x = hour, y = value)) + 
  geom_line()

grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

7.4 Relationships between variables

The Constitucion Station is the only station with meteorological data. So, we are going to focus our efforts of data exploration on this station.

Reference: http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization

Pollutants correlations matrix

constitucion_data <- readRDS("data_rds/constitucion_data.rds")

### Pollutants correlation matrix

# We select the pollutants variables
pollutants <- constitucion_data %>% select(PM10, PM25, NO2, NO, SO2, CO, O3, MXIL, TOL) %>%
  na.omit()

# We create the pollutants correlation matrix
cor_matrix <- round(cor(pollutants), 2)

# We transform the table to long format to be handled by ggplot2
long_cor_matrix <- melt(cor_matrix)

# Get lower triangle of the correlation matrix
get_lower_tri<-function(cor_matrix){
  cor_matrix[upper.tri(cor_matrix)] <- NA
  return(cor_matrix)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cor_matrix){
  cor_matrix[lower.tri(cor_matrix)]<- NA
  return(cor_matrix)
}

upper_tri <- get_upper_tri(cor_matrix)

# Melt the correlation matrix
library(reshape2)
long_cor_matrix <- melt(upper_tri, na.rm = TRUE)

# Create a ggheatmap
ggheatmap <- ggplot(long_cor_matrix, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()


pollutants_heatmap <- ggheatmap +  geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5))

print(pollutants_heatmap)

Pollutants-weather data correlations matrix

constitucion_data <- readRDS("data_rds/constitucion_data.rds")

### Pollutants correlation matrix

# We select the pollutants variables
pollutants_weather <- constitucion_data %>% select(PM10, PM25, NO2, NO, SO2, CO, O3, MXIL, TOL, dd, vv, TMP, PRB, LL, HR, RS) %>%
  na.omit()

# We create the pollutants correlation matrix
cor_matrix <- round(cor(pollutants_weather), 2)

# We transform the table to long format to be handled by ggplot2
long_cor_matrix <- melt(cor_matrix)

# Get lower triangle of the correlation matrix
get_lower_tri<-function(cor_matrix){
  cor_matrix[upper.tri(cor_matrix)] <- NA
  return(cor_matrix)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cor_matrix){
  cor_matrix[lower.tri(cor_matrix)]<- NA
  return(cor_matrix)
}

upper_tri <- get_upper_tri(cor_matrix)

# Melt the correlation matrix
library(reshape2)
long_cor_matrix <- melt(upper_tri, na.rm = TRUE)

# Create a ggheatmap
ggheatmap <- ggplot(long_cor_matrix, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()


pollutants_heatmap <- ggheatmap +  geom_text(aes(Var2, Var1, label = value), color = "black", size = 2) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5))

print(pollutants_heatmap)

The openair package is a R package especialized in the visualizations of air quality data.

Wind visualizations (openair package)

windRose(air_data_2)

Pollution rose ((openair package))

pollutionRose(air_data_2, pollutant = "PM10")