US Broadband – Unlocking the Power of Data Visualization with R

# Week 20 US Broadband
# source of data:
# https://github.com/rfordatascience/tidytuesday/blob/master/data/2021/2021-05-11/readme.md
# https://github.com/microsoft/USBroadbandUsagePercentages

# load libraries ------------------------------------
library(tidytuesdayR)
library(tidyverse)
library(DataExplorer)

library(zipcodeR)
library(janitor)
library(stringr)


# load data -----------------------------------
tuesdata <- tidytuesdayR::tt_load(2021, week = 20)

broadband <- tuesdata$broadband
broadband_zipcode <- tuesdata$broadband_zip

#broadband_zipcode<-read.csv("https://raw.githubusercontent.com/microsoft/USBroadbandUsagePercentages/master/dataset/broadband_data_zipcode.csv")
# contains mean absolute error (MAE), mean signed deviation (MSD)

# check missing data and str------------------------------
profile_missing(broadband);head(broadband)
profile_missing(broadband_zipcode);head(broadband_zipcode)

str(broadband);dim(broadband)
str(broadband_zipcode);dim(broadband_zipcode)


# data wrangling -------------------------------
names(broadband)<-make.names(tolower(names(broadband)))
names(broadband_zipcode)<-make.names(tolower(names(broadband_zipcode)))

broadband%>%filter(str_detect("-",broadband$broadband.availability.per.fcc))
broadband$broadband.availability.per.fcc[broadband$broadband.availability.per.fcc=="-"]<-"0"
broadband$broadband.availability.per.fcc<-as.double(broadband$broadband.availability.per.fcc)

broadband%>%arrange(broadband.usage)
broadband$broadband.usage[broadband$broadband.usage=="-"]<-"0"
broadband$broadband.usage<-as.double(broadband$broadband.usage)

###########################################################

broadband <- broadband %>%
  mutate(county.name=sub(" County","",county.name)) %>%
  separate(county.id,into=c("state.id","county.id"),sep=-3) %>%
  mutate(county.name = case_when(
    county.name=="LaSalle Parish" ~ "La Salle Parish",
    TRUE~county.name))

# separate the county.id by state and county---------------
  broadband_zipcode <- broadband_zipcode %>%
  separate(county.id,into=c("state.id","county.id"),sep=-3)


#####################################
# in broadband_zipcode
# some postal codes are made of 4 digits, to use "geocode_zip"
# need 5 digits, so add a zero at the begin of the string

# plyr::count(sprintf("%05d", broadband_zipcode$postal.code))

broadband_zipcode$postal.code<-sprintf("%05d", broadband_zipcode$postal.code)

# broadband%>%filter(str_detect(county.name,"Bedford"))


##################################################
# make a unified dataset with broadband, postal codes and geodada------------------------------
my_df <- broadband%>%
  full_join(broadband_zipcode, by= c("st","state.id","county.id"))%>%
  select(1,2,3,7,8,5,6,9:12) %>%
  drop_na()


names(my_df)<-c("st","state.id","county.id","county.name","postal.code","brd.available","usage.1119","usage.1020","mae","alpha","msd")

head(my_df);dim(my_df)


#########################################
# check of the counties in the data sets ----------
c$county.name.x[!c$county.name.x%in%d$county.name.y]
d$county.name.y[!d$county.name.y%in%c$county.name.x]

ss<-broadband%>%
  full_join(broadband_zipcode, by= c("st","state.id","county.id")) %>%
  filter(county.name.x %in% c("Bedford city",
                              "Covington city",
                              "Emporia city",
                              "Fairfax city",
                              "Kusilvak Census Area",
                              "Lexington city",
                              "Manassas Park city",
                              "Martinsville city",
                              "Oglala",
                              "Otter Tail")) %>%
  group_by(st,state.id,county.id) %>%
  summarize(unique(county.name.x),unique(county.name.y),unique(postal.code))
####################################


# make a new column with y/n broadband in the county
my_df <- my_df %>%
  mutate(broadband.id=ifelse(brd.available==0,"no","yes"))


# find the geocodes with postal codes -------------------
geocode_zip<-geocode_zip(my_df$postal.code)


# add the geocodes------------------------------
s<-my_df%>%
  inner_join(geocode_zip,by=c("postal.code"="zipcode"))%>%
  unite("id",state.id:county.id,sep= "")


# load the libraries for plotting ---------------------
library(sf)
library(raster)
library(spData)
library(spDataLarge)

library(maps)
library(viridis)
library(ggthemes)
library(RColorBrewer)

# add font to mac---
library(showtext)
library(extrafont)
#font_import(pattern="world of water")
loadfonts()
fonts()

# mapping --------------------------------

mypalette<-display.brewer.pal(7,"BrBG")
us_county_map <- map_data("county")

final_plot<-ggplot()+
  geom_polygon(data=us_county_map,aes(x=long,y=lat,group = group),
               fill=NA,color = "lightblue")+
  geom_point(data=subset(s,lat>25&lat<50),
             aes(x=lng,y=lat, group =st,color=brd.available),
             alpha=0.3,size=0.5)+
  scale_color_viridis(labels = scales::percent)+
  labs(title="America's Broadband",
       subtitle="available values by County",
       caption="Viz. Federica Gazzelloni | US Broadband,Microsoft GitHub,The Verge | TidyTuesday week20",
       color="")+
  theme_map()+
  theme(plot.title =element_text(size=40,face="bold",family ="Courier New",color="black"),
        plot.subtitle =element_text(size=25,face="bold",family ="Courier New"),
        plot.caption =element_text(size=9,face="bold",family ="Courier New"),
        plot.title.position = "panel",
        plot.margin = margin(5,5,5,5),
        legend.text = element_text(size=8,family ="Courier New"))


################################################################################


####### SAVING ######################################
ragg::agg_png(here::here("tidytuesday_Broadband.png"),
              res = 320, width = 14, height = 8, units = "in")
final_plot

dev.off()



#### ATTACHING LOGO ############################ 
library(ggimage)
library(magick)


tidy_logo<-image_read("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/static/plot_logo.png") %>%
  image_resize("300x300")


final_plot <- image_read("tidytuesday_Broadband.png")

attached_logo <- image_composite(final_plot, tidy_logo,
                                 operator="atop",
                                 gravity="northeast") # tell R where to put the logo


image_write(attached_logo, path = "tidytuesday_Broadband.png", format = "png") # save final plot