7 Data Exploration

Loading packages

library(readr)
library(dplyr)
library(tidyr)
library(openair) # http://davidcarslaw.github.io/openair/
library(purrr)
library(lubridate)
library(ggplot2)
library(stringr)
library(knitr)
library(xts)
library(zoo)
library(gridExtra)
library(astsa)
library(rvest)
library(fpp2)
library(ranger)
library(broom)
library(RcppRoll)
library(reshape2)

Data loading

air_data_2 <- readRDS("data_rds/air_data_2.rds")

7.1 Trends exploration

We take a look to the general trend of several indicators through the last 18 years

# We calcule the yearly mean of the pollutants levels.
year_avgs <- air_data_2 %>% select(station_alias, date_time_utc, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, year = year(date_time_utc)) %>%
  summarise_all(funs(mean(., na.rm = TRUE))) %>% 
  select(-date_time_utc) # We drop this variable

# We export the years_avgs as a csv file

write_csv(year_avgs, "data_final_csvs/year_avgs.csv")

# We convert the table to long format

year_avgs_long <- gather(year_avgs, contaminante, value, 3:length(year_avgs)) %>% 
                    filter(!(grepl('Constit', station_alias) 
                             & year == '2006' & 
                               contaminante %in% c('BEN', 'MXIL', 'TOL'))) %>%
                                # We filter this data because is only completed in 0.01%
                    filter(!(grepl('Constit', station_alias) &
                              year == '2008' & contaminante == 'PM25'))
                                # We filter this data because is only completed in 0.02%

# We present the data in a grid of graphs

ggplot(year_avgs_long, aes(x = year, y = value)) + 
  geom_line() + 
  facet_grid(contaminante~station_alias,scales="free_y") +
   theme(axis.text = element_text(size = 6))

# We calcule the hourly mean of the pollutants levels.
hour_avgs <- air_data_2 %>% select(station_alias, hour, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, hour) %>%
  summarise_all(funs(mean(., na.rm = TRUE)))  # quito ahora esta variable, porque no tiene sentido que salga su media.

# We convert the table to long format

hour_avgs_long <- gather(hour_avgs, contaminante, value, 3:length(hour_avgs))

# We present the data in a grid of graphs

ggplot(hour_avgs_long, aes(x = hour, y = value)) + 
  geom_line() + 
  facet_grid(contaminante~station_alias,scales="free_y") +
   theme(axis.text = element_text(size = 6))

# We calcule the monthly mean of the pollutants levels.
month_avgs <- air_data_2 %>% select(station_alias, month, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, month) %>%
  summarise_all(funs(mean(., na.rm = TRUE)))  # quito ahora esta variable, porque no tiene sentido que salga su media.

# We convert the table to long format

month_avgs_long <- gather(month_avgs, contaminante, value, 3:length(month_avgs))

# We present the data in a grid of graphs

ggplot(month_avgs_long, aes(x = month, y = value)) + 
  geom_line() + 
  facet_grid(contaminante~station_alias,scales="free_y") +
   theme(axis.text = element_text(size = 6))

# We calcule the weekly mean of the pollutants levels.
week_day_avgs <- air_data_2 %>% select(station_alias, week_day, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, week_day) %>%
  summarise_all(funs(mean(., na.rm = TRUE)))  # quito ahora esta variable, porque no tiene sentido que salga su media.

# We convert the table to long format

week_day_avgs_long <- gather(week_day_avgs, contaminante, value, 3:length(week_day_avgs))

# We present the data in a grid of graphs

ggplot(week_day_avgs_long, aes(x = week_day, y = value)) + 
  geom_line() + 
  facet_grid(contaminante~station_alias,scales="free_y") +
   theme(axis.text = element_text(size = 6))

We take a look to the distributions

hist_table <- air_data_2 %>% filter(year(date_time_utc) == '2017') %>% 
                        select(station_alias, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL)

# We convert the table to long format

hist_table_long <- hist_table %>% gather(pollutant, value, 2:length(hist_table))

# We present the data in a grid of graphs

ggplot(hist_table_long, aes(x = value)) + 
  geom_histogram() + 
  xlim(c(0, 150)) +
  facet_grid(pollutant~station_alias,scales="free_y") +
   theme(axis.text = element_text(size = 6))

7.2 PM10 Constitucion Station

We create the dataset pm10 with PM10 values from the Constitución Station and we execute a summary

pm10 <- air_data_2 %>% filter(station == '1') %>%
                        select(date_time_utc, PM10)

summary(pm10)

##  date_time_utc                      PM10       
##  Min.   :2000-01-01 00:00:00   Min.   :  0.00  
##  1st Qu.:2004-06-30 23:15:00   1st Qu.: 19.00  
##  Median :2009-01-02 00:30:00   Median : 29.00  
##  Mean   :2008-12-31 18:50:45   Mean   : 34.39  
##  3rd Qu.:2013-07-02 23:45:00   3rd Qu.: 44.00  
##  Max.   :2017-12-31 23:00:00   Max.   :888.00  
##                                NA's   :3106

25% of the values are between 44.00 and 888.00. 888.00 is a value really extreme. How many extreme values (outliers) do we have in this series? We plot all the values to visualise this:

ggplot(pm10, aes(x = date_time_utc, y = PM10)) + 
         geom_point(alpha = 0.1)

We have very few values greater than 250. So, it doesn’t seem we have a problem with the outliers.

We filter out the most extreme values to improve the visualization, but

pm10_no_outliers <- pm10 %>% filter(PM10 <= 300)
ggplot(pm10_no_outliers, aes(x = date_time_utc, y = PM10)) + 
         geom_point(alpha = 0.1)

ggplot(pm10, aes(x = date_time_utc, y = PM10)) + 
         geom_point(alpha = 0.1, size = 0.2) +
          scale_y_continuous(trans='log2')

PM10_2017 <- pm10 %>% filter(year(date_time_utc) == '2017') 
ggplot(PM10_2017, aes(x = date_time_utc, y = PM10)) + 
         geom_point(alpha = 0.1, size = 1) +
          scale_y_continuous(trans='log2')

Daily averages

We create a new dataset with the PM10 daily averages and we plot them in a new graphic. We add a trend line too. There is a clear downward trend in the measurements and we have many fewer extreme values during the last decade. It seems like we have two very clear “epochs” in the data, with a clear change in the data structure starting somewhere around 2009.

pm10_day_avg <- pm10 %>% group_by(day = date(date_time_utc)) %>%
                          summarise(day_avg = mean(PM10, na.rm = TRUE))

ggplot(pm10_day_avg, aes(x = day, y = day_avg, , colour = day_avg)) + 
         geom_point(alpha = 0.5) +
         geom_smooth(color = "grey", alpha = 0.2) +
         scale_colour_gradientn(colours = terrain.colors(10)) +
         theme(legend.position = c(0.3, 0.9),
                legend.background = element_rect(colour = "transparent", fill = NA), legend.direction = "horizontal") +
         labs(colour = "PM10 daily average (colour scale)", x = "Year", y = "PM10 daily average", title = "PM10 daily average - 2000-2017 evolution (Constitucion station)")

We plot the same data but through a box plot graph. Now we can see very clearly how since 2009 the behaviour of the data is sustantially different from the precedent years. The PM10 day levels are smaller, the variability is significantly less pronounced and the trend is smoother too.

pm10_day_avg <- pm10 %>% group_by(day = date(date_time_utc)) %>%
                          summarise(day_avg = mean(PM10, na.rm = TRUE))

ggplot(pm10_day_avg, aes(y = day_avg, x = as.factor(year(day)), colour = day_avg)) + 
         geom_boxplot(alpha = 0.5)

We identify a very clear trend through the years on the last graph. But, as we already saw before on the grid graphs there are other things happening at the same time.

year_const <- year_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'PM10')
plot1 <- ggplot(year_const, aes(x = year, y = value)) + 
  geom_line()

month_const <- month_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'PM10')
plot2 <- ggplot(month_const, aes(x = month, y = value)) + 
  geom_line()

week_day_const <- week_day_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'PM10')
plot3 <- ggplot(week_day_const, aes(x = week_day, y = value)) + 
  geom_line()

hour_const <- hour_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'PM10')
plot4 <- ggplot(hour_const, aes(x = hour, y = value)) + 
  geom_line()

grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

7.3 NO2 Constitucion Station

We create the dataset pm10 with PM10 values from the Constitución Station and we execute a summary

NO2 <- air_data_2 %>% filter(station == '1') %>%
                        select(date_time_utc, NO2)

summary(NO2)

##  date_time_utc                      NO2        
##  Min.   :2000-01-01 00:00:00   Min.   :  0.00  
##  1st Qu.:2004-06-30 23:15:00   1st Qu.: 20.00  
##  Median :2009-01-02 00:30:00   Median : 34.00  
##  Mean   :2008-12-31 18:50:45   Mean   : 37.26  
##  3rd Qu.:2013-07-02 23:45:00   3rd Qu.: 51.00  
##  Max.   :2017-12-31 23:00:00   Max.   :408.00  
##                                NA's   :2930

NO2_day_avg <- NO2 %>% group_by(day = date(date_time_utc)) %>%
                          summarise(day_avg = mean(NO2, na.rm = TRUE))

ggplot(NO2_day_avg, aes(x = day, y = day_avg, , colour = day_avg)) + 
         geom_point(alpha = 0.5) +
         geom_smooth(color = "grey", alpha = 0.2) +
         scale_colour_gradientn(colours = terrain.colors(10)) +
         theme(legend.position = c(0.3, 0.9),
                legend.background = element_rect(colour = "transparent", fill = NA), legend.direction = "horizontal") +
         labs(colour = "PM10 daily average (colour scale)", x = "Year", y = "PM10 daily average", title = "NO2 daily average - 2000-2017 evolution (Constitucion station)")

significantly less pronounced and the trend is smoother too.

NO2_day_avg <- NO2 %>% group_by(day = date(date_time_utc)) %>%
                          summarise(day_avg = mean(NO2, na.rm = TRUE))

ggplot(NO2_day_avg, aes(y = day_avg, x = as.factor(year(day)), colour = day_avg)) + 
         geom_boxplot(alpha = 0.5)

year_const <- year_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'NO2')
plot1 <- ggplot(year_const, aes(x = year, y = value)) + 
  geom_line()

month_const <- month_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'NO2')
plot2 <- ggplot(month_const, aes(x = month, y = value)) + 
  geom_line()

week_day_const <- week_day_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'NO2')
plot3 <- ggplot(week_day_const, aes(x = week_day, y = value)) + 
  geom_line()

hour_const <- hour_avgs_long %>% filter(grepl('Constit', station_alias), contaminante == 'NO2')
plot4 <- ggplot(hour_const, aes(x = hour, y = value)) + 
  geom_line()

grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

7.4 Relationships between variables

The Constitucion Station is the only station with meteorological data. So, we are going to focus our efforts of data exploration on this station.

Reference: http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization

Pollutants correlations matrix

constitucion_data <- readRDS("data_rds/constitucion_data.rds")

### Pollutants correlation matrix

# We select the pollutants variables
pollutants <- constitucion_data %>% select(PM10, PM25, NO2, NO, SO2, CO, O3, MXIL, TOL) %>%
  na.omit()

# We create the pollutants correlation matrix
cor_matrix <- round(cor(pollutants), 2)

# We transform the table to long format to be handled by ggplot2
long_cor_matrix <- melt(cor_matrix)

# Get lower triangle of the correlation matrix
get_lower_tri<-function(cor_matrix){
  cor_matrix[upper.tri(cor_matrix)] <- NA
  return(cor_matrix)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cor_matrix){
  cor_matrix[lower.tri(cor_matrix)]<- NA
  return(cor_matrix)
}

upper_tri <- get_upper_tri(cor_matrix)

# Melt the correlation matrix
library(reshape2)
long_cor_matrix <- melt(upper_tri, na.rm = TRUE)

# Create a ggheatmap
ggheatmap <- ggplot(long_cor_matrix, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()


pollutants_heatmap <- ggheatmap +  geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5))

print(pollutants_heatmap)

Pollutants-weather data correlations matrix

constitucion_data <- readRDS("data_rds/constitucion_data.rds")

### Pollutants correlation matrix

# We select the pollutants variables
pollutants_weather <- constitucion_data %>% select(PM10, PM25, NO2, NO, SO2, CO, O3, MXIL, TOL, dd, vv, TMP, PRB, LL, HR, RS) %>%
  na.omit()

# We create the pollutants correlation matrix
cor_matrix <- round(cor(pollutants_weather), 2)

# We transform the table to long format to be handled by ggplot2
long_cor_matrix <- melt(cor_matrix)

# Get lower triangle of the correlation matrix
get_lower_tri<-function(cor_matrix){
  cor_matrix[upper.tri(cor_matrix)] <- NA
  return(cor_matrix)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cor_matrix){
  cor_matrix[lower.tri(cor_matrix)]<- NA
  return(cor_matrix)
}

upper_tri <- get_upper_tri(cor_matrix)

# Melt the correlation matrix
library(reshape2)
long_cor_matrix <- melt(upper_tri, na.rm = TRUE)

# Create a ggheatmap
ggheatmap <- ggplot(long_cor_matrix, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()


pollutants_heatmap <- ggheatmap +  geom_text(aes(Var2, Var1, label = value), color = "black", size = 2) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5))

print(pollutants_heatmap)

The openair package is a R package especialized in the visualizations of air quality data.

Wind visualizations (openair package)

windRose(air_data_2)

Pollution rose ((openair package))

pollutionRose(air_data_2, pollutant = "PM10")