# Week 20 US Broadband# source of data:# https://github.com/rfordatascience/tidytuesday/blob/master/data/2021/2021-05-11/readme.md# https://github.com/microsoft/USBroadbandUsagePercentages# load libraries ------------------------------------library(tidytuesdayR)library(tidyverse)library(DataExplorer)library(zipcodeR)library(janitor)library(stringr)# load data -----------------------------------tuesdata <- tidytuesdayR::tt_load(2021, week =20)broadband <- tuesdata$broadbandbroadband_zipcode <- tuesdata$broadband_zip#broadband_zipcode<-read.csv("https://raw.githubusercontent.com/microsoft/USBroadbandUsagePercentages/master/dataset/broadband_data_zipcode.csv")# contains mean absolute error (MAE), mean signed deviation (MSD)# check missing data and str------------------------------profile_missing(broadband);head(broadband)profile_missing(broadband_zipcode);head(broadband_zipcode)str(broadband);dim(broadband)str(broadband_zipcode);dim(broadband_zipcode)# data wrangling -------------------------------names(broadband)<-make.names(tolower(names(broadband)))names(broadband_zipcode)<-make.names(tolower(names(broadband_zipcode)))broadband%>%filter(str_detect("-",broadband$broadband.availability.per.fcc))broadband$broadband.availability.per.fcc[broadband$broadband.availability.per.fcc=="-"]<-"0"broadband$broadband.availability.per.fcc<-as.double(broadband$broadband.availability.per.fcc)broadband%>%arrange(broadband.usage)broadband$broadband.usage[broadband$broadband.usage=="-"]<-"0"broadband$broadband.usage<-as.double(broadband$broadband.usage)###########################################################broadband <- broadband %>%mutate(county.name=sub(" County","",county.name)) %>%separate(county.id,into=c("state.id","county.id"),sep=-3) %>%mutate(county.name =case_when( county.name=="LaSalle Parish"~"La Salle Parish",TRUE~county.name))# separate the county.id by state and county--------------- broadband_zipcode <- broadband_zipcode %>%separate(county.id,into=c("state.id","county.id"),sep=-3)###################################### in broadband_zipcode# some postal codes are made of 4 digits, to use "geocode_zip"# need 5 digits, so add a zero at the begin of the string# plyr::count(sprintf("%05d", broadband_zipcode$postal.code))broadband_zipcode$postal.code<-sprintf("%05d", broadband_zipcode$postal.code)# broadband%>%filter(str_detect(county.name,"Bedford"))################################################### make a unified dataset with broadband, postal codes and geodada------------------------------my_df <- broadband%>%full_join(broadband_zipcode, by=c("st","state.id","county.id"))%>%select(1,2,3,7,8,5,6,9:12) %>%drop_na()names(my_df)<-c("st","state.id","county.id","county.name","postal.code","brd.available","usage.1119","usage.1020","mae","alpha","msd")head(my_df);dim(my_df)########################################## check of the counties in the data sets ----------c$county.name.x[!c$county.name.x%in%d$county.name.y]d$county.name.y[!d$county.name.y%in%c$county.name.x]ss<-broadband%>%full_join(broadband_zipcode, by=c("st","state.id","county.id")) %>%filter(county.name.x %in%c("Bedford city","Covington city","Emporia city","Fairfax city","Kusilvak Census Area","Lexington city","Manassas Park city","Martinsville city","Oglala","Otter Tail")) %>%group_by(st,state.id,county.id) %>%summarize(unique(county.name.x),unique(county.name.y),unique(postal.code))##################################### make a new column with y/n broadband in the countymy_df <- my_df %>%mutate(broadband.id=ifelse(brd.available==0,"no","yes"))# find the geocodes with postal codes -------------------geocode_zip<-geocode_zip(my_df$postal.code)# add the geocodes------------------------------s<-my_df%>%inner_join(geocode_zip,by=c("postal.code"="zipcode"))%>%unite("id",state.id:county.id,sep="")# load the libraries for plotting ---------------------library(sf)library(raster)library(spData)library(spDataLarge)library(maps)library(viridis)library(ggthemes)library(RColorBrewer)# add font to mac---library(showtext)library(extrafont)#font_import(pattern="world of water")loadfonts()fonts()# mapping --------------------------------mypalette<-display.brewer.pal(7,"BrBG")us_county_map <-map_data("county")final_plot<-ggplot()+geom_polygon(data=us_county_map,aes(x=long,y=lat,group = group),fill=NA,color ="lightblue")+geom_point(data=subset(s,lat>25&lat<50),aes(x=lng,y=lat, group =st,color=brd.available),alpha=0.3,size=0.5)+scale_color_viridis(labels = scales::percent)+labs(title="America's Broadband",subtitle="available values by County",caption="Viz. Federica Gazzelloni | US Broadband,Microsoft GitHub,The Verge | TidyTuesday week20",color="")+theme_map()+theme(plot.title =element_text(size=40,face="bold",family ="Courier New",color="black"),plot.subtitle =element_text(size=25,face="bold",family ="Courier New"),plot.caption =element_text(size=9,face="bold",family ="Courier New"),plot.title.position ="panel",plot.margin =margin(5,5,5,5),legend.text =element_text(size=8,family ="Courier New"))####################################################################################### SAVING ######################################ragg::agg_png(here::here("tidytuesday_Broadband.png"),res =320, width =14, height =8, units ="in")final_plotdev.off()#### ATTACHING LOGO ############################ library(ggimage)library(magick)tidy_logo<-image_read("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/static/plot_logo.png") %>%image_resize("300x300")final_plot <-image_read("tidytuesday_Broadband.png")attached_logo <-image_composite(final_plot, tidy_logo,operator="atop",gravity="northeast") # tell R where to put the logoimage_write(attached_logo, path ="tidytuesday_Broadband.png", format ="png") # save final plot