library(tidyverse)
library(ISLR2)
# data()
Mid-Atlantic Wage Data
Wage and other data for a group of 3000 male workers in the Mid-Atlantic region.
%>%names Wage
summary(Wage)
<- Wage %>%
wage1 mutate(across(where(is.factor),as.character)) %>% #count(region)
mutate(maritl_id=gsub("\\D","",maritl),
race_id=gsub("\\D","",race),
education_id=gsub("\\D","",education),
jobclass_id=gsub("\\D","",jobclass),
health_id=gsub("\\D","",health),
health_ins_id=gsub("\\D","",health_ins)) %>%
select(-maritl,-race,-education,-region,-jobclass,-health,-health_ins) %>%
mutate(across(where(is.character),as.integer))
<- colorRampPalette(c("#91CBD765", "#CA225E"))
tmwr_cols %>%
wage1 cor() %>%
::corrplot(col = tmwr_cols(200), tl.col = "black") +
corrplot::facet_wrap(~wage) ggplot2
library(tidymodels)
tidymodels_prefer()
library(corrplot)
library(ggforce)
library(bestNormalize)
set.seed(1701)
<- initial_split(wage1, strata = wage, prop = 3/4)
split
<- training(split)
train <- testing(split)
test
set.seed(1702)
<- validation_split(train, strata = wage, prop = 4/5)
val $splits[[1]] val
<-
rec # Use the training data from the val split object
recipe(wage ~ ., data = analysis(val$splits[[1]])) %>%
step_select(-logwage)%>%
step_zv(all_numeric_predictors()) %>%
step_orderNorm(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())
<- prep(rec)
rec_trained
<-
show_variables %>%
rec prep(log_changes = TRUE)
<- val$splits %>% pluck(1) %>% assessment()
validation <- bake(rec_trained, new_data = validation) val_processed
%>%prep()%>%bake(new_data=NULL) %>%
rec::tidy() broom
library(ggdist)
library(distributional)
%>%prep()%>%bake(new_data=NULL) %>%
rec::tidy() %>%
broomfilter(!column=="wage")%>%
ggplot(aes(x=column))+
geom_col(aes(y=mean))
library(ggdist)
library(distributional)
<- Wage %>%#count(age)
Wage_age_cat mutate(age_cat=cut(age,breaks = 5)) %>% #count(race)
select(-age,-logwage)
# average calculation
<- glm(wage~ education+race+age_cat+maritl-1,family = "gaussian",data=Wage_age_cat)
mod
%>%
modtidy() %>%
mutate(term0=case_when(str_detect(term,"education")~"education",
str_detect(term,"race")~"race",
str_detect(term,"maritl")~"maritl",
str_detect(term,"age_cat")~"age_cat",
TRUE~term)) %>%
mutate(term1=gsub("^[A-z]+\\d. ","",term)) %>%
ggplot(aes(y = fct_reorder(term1,estimate),
xdist = dist_student_t(df = df.residual(mod),
mu = estimate,
sigma = std.error))
+
) ::stat_halfeye()+
ggdiststat_dots(position = "dodge")+ # , color = "pink"
facet_wrap(vars(term0),scales = "free")+
::scale_color_hilda()+
tvthemes::theme_theLastAirbender() tvthemes
%>%
modtidy() %>%
mutate(term0=case_when(str_detect(term,"education")~"education",
str_detect(term,"race")~"race",
str_detect(term,"maritl")~"maritl",
str_detect(term,"age_cat")~"age_cat",
TRUE~term)) %>%
mutate(term1=gsub("^[A-z]+\\d. ","",term)) %>%
ggplot(aes(y = fct_reorder(term1,estimate),
xdist = dist_student_t(df = df.residual(mod),
mu = estimate,
sigma = std.error))
+
) ::stat_halfeye()+
ggdiststat_dots(position = "dodge")+ # , color = "pink"
facet_wrap(vars(term0),scales = "free")+
::scale_color_hilda()+
tvthemes::theme_theLastAirbender() tvthemes
%>%
Wagemutate(race=as.character(race)) %>%
ggplot()+
aes(x=wage,y=race)+
::geom_dotsinterval(layout="weave",side="bottom")+
ggdist::stat_halfeye() ggdist
library(extrafont)
loadfonts()
%>%
Wage mutate(education=gsub("\\d. ","",education)) %>% #count(year)
group_by(education)%>%
mutate(mean=mean(wage),
sd=sd(wage)) %>%
ungroup() %>% # pull(mean)%>%summary
select(education,mean,sd) %>%
distinct()%>%
ggplot(aes(y=fct_reorder(education,mean),
xdist = dist_normal(mean, sd),
layout = "weave",
fill = stat(x < 111.70))) +
stat_dots(position = "dodge", color = "grey70")+
geom_vline(xintercept = 111.70, alpha = 0.25) +
scale_x_continuous(breaks = c(20,60,90,112,140,180,220)) +
::scale_fill_hilda()+
tvthemeslabs(x="Wage values from 2003 to 2009",
y="",color="Race",fill="wage < avg",
title="Wage distribution vs education 2003-2009",
subtitle="Normalized values",
caption="#30DayChartChallenge 2022 #day9 - Distribution/Statistics - v2\nDataSource: {ISLR2} Wage dataset | DataViz: Federica Gazzelloni")+
::theme_avatar()+
tvthemestheme(text = element_text(family="Chelsea Market"),
legend.background = element_blank(),
legend.box.background = element_blank(),
legend.key = element_blank(),
legend.key.width = unit(0.5,units="cm"),
legend.direction = "horizontal",
legend.position = c(0.8,0.1))
ggsave("education.png")
%>%
Wage mutate(education=gsub("\\d. ","",education)) %>% #count(year)
group_by(education)%>%
mutate(mean=mean(wage),
sd=sd(wage)) %>%
ungroup() %>%
ggplot(aes(x = wage,
y =fct_reorder(education,wage),color=race)) +
stat_dots(side = "both",size=2.5)+
scale_color_brewer(palette = "Dark2") +
xlim(20,200)+
labs(x="Wage values from 2003 to 2009",
y="",color="Race")+
::theme_theLastAirbender()+
tvthemestheme(legend.background = element_blank(),
legend.box.background = element_blank(),
legend.key = element_blank(),
legend.key.width = unit(0.5,units="cm"),
legend.direction = "horizontal",
legend.position = c(0.2,0.97))
<- function(recipe,
plot_validation_results dat = assessment(val$splits[[1]])) {
%>%
recipe # Estimate any additional steps
prep() %>%
# Process the data (the validation set by default)
bake(new_data = dat) %>%
# Create the scatterplot matrix
ggplot(aes(x = .panel_x, y = .panel_y, col = wage, fill = wage)) +
geom_point(alpha = 0.4, size = 0.5) +
geom_autodensity(alpha = .3) +
facet_matrix(vars(-wage), layer.diag = 2) +
::scale_color_viridis(option = "A") +
viridis::scale_fill_viridis(option = "A")
viridis }
%>%
rec_trained step_pca(all_numeric_predictors(), num_comp = 4) %>%
plot_validation_results() +
ggtitle("Principal Component Analysis")+
::theme_theLastAirbender() tvthemes
%>%
rec_trained step_pls(all_numeric_predictors(), outcome = "wage", num_comp = 4) %>%
plot_validation_results() +
ggtitle("Partial Least Squares")+
::theme_theLastAirbender() tvthemes
%>%
rec_trained step_ica(all_numeric_predictors(), num_comp = 4) %>%
plot_validation_results() +
ggtitle("Independent Component Analysis")+
::theme_theLastAirbender() tvthemes
%>%
Wagemutate(age1=cut(age,5),.after=age)%>%
mutate(year=as.factor(year))%>%
ggplot(aes(year,wage,group=year))+
geom_violin()+
# facet_wrap(~age1)+
::theme_theLastAirbender() tvthemes
<- Wage%>%#names
p1 ggplot(aes(age,logwage,color=maritl,fill=maritl))+
geom_jitter(size=0.5,alpha=0.5,shape=21,stroke=0.5)+
geom_smooth(size=0.5,se=F)+#,color="darkred")+
labs(title="\n")+
scale_color_hilda(palette = "Day",n=6, type = "discrete",reverse=T) +
::theme_theLastAirbender(text.font = "Times", title.font = "Times",
tvthemeslegend.font = "Times")+
theme(axis.text.x = element_blank(),
legend.position = "top")
p1
library(extrafont)
loadfonts()
library(hrbrthemes)
library(tvthemes)
library(ggthemes)
library(geomtextpath)
::geom_textpath()
geomtextpath
<-Wage%>%
p2 pivot_longer(cols = c("year","age","wage"),names_to="names",values_to="values")%>%
ggplot(aes(values))+
geom_textdensity(aes(label=names,color="red"),size = 6,
fontface = 2, #fontfamily= "Chelsea Market",
hjust = 0.2, vjust = 0.3,
show.legend = F) +
facet_wrap(~names,scales = "free")+
scale_color_hilda(palette = "Day",n=6, type = "discrete",reverse=T) +
::theme_theLastAirbender(text.font = "Times", title.font = "Times",
tvthemeslegend.font = "Times")+
theme(axis.text.x = element_blank())
p2
library(ggdist)
%>%
Wagepivot_longer(cols = c("year","age","wage"),names_to="names",values_to="values")%>%
ggplot(aes(values))+
geom_slabinterval()
library(cowplot)
<- ggdraw()+
final draw_plot(p1) +
draw_plot(p2,scale=0.5)
::annotate_figure(
ggpubr
final,top = "Ciao",
bottom = "addio",
left = "align",
right = "ok",
fig.lab = "a",
fig.lab.pos = c("top.left", "top", "top.right", "bottom.left", "bottom",
"bottom.right"),
fig.lab.size=2,
fig.lab.face="bold"
)
%>%#names
Wageggplot(aes(race,wage))+
geom_col()+
coord_polar(theta = "y")
%>%#names
Wageggplot(aes(education,wage))+
geom_col()+
coord_polar(theta = "x")
%>%
Wagemutate(maritl=gsub("\\d. ","",maritl))%>%
ggplot(aes(maritl))+
geom_textdensity(aes(label=maritl,color=maritl),size = 6,
fontface = 2, #fontfamily= "Chelsea Market",
hjust = 0.2, vjust = 0.3,
show.legend = F) +
# facet_wrap(~names,scales = "free")+
scale_color_hilda(palette = "Day",n=6, type = "discrete",reverse=T) +
::theme_theLastAirbender(text.font = "Times", title.font = "Times",
tvthemeslegend.font = "Times")+
theme(axis.text.x = element_blank())
%>%
Wagemutate(race=gsub("\\d. ","",race))%>%
ggplot(aes(race))+
geom_textdensity(aes(label=race,color=race),size = 6,
fontface = 2, #fontfamily= "Chelsea Market",
hjust = 0.2, vjust = 0.3,
show.legend = F) +
# facet_wrap(~names,scales = "free")+
scale_color_hilda(palette = "Day",n=6, type = "discrete",reverse=T) +
::theme_theLastAirbender(text.font = "Times", title.font = "Times",
tvthemeslegend.font = "Times")+
theme(axis.text.x = element_blank())
%>%#count(education)
Wagemutate(education=gsub("\\d. ","",education),
education=case_when(education=="< HS Grad"~"Under graduate",
=="Advanced Degree"~"Degree",
educationTRUE~"Graduate"))%>%
ggplot(aes(race))+
geom_textdensity(aes(label=education,color=education),
size = 6,
fontface = 2,
hjust = 0.2, vjust = 0.3,
show.legend = F) +
# facet_wrap(~names,scales = "free")+
scale_color_hilda(palette = "Day",n=6, type = "discrete",reverse=T) +
facet_wrap(~education)+
::theme_theLastAirbender(text.font = "Times", title.font = "Times",
tvthemeslegend.font = "Times")+
theme(axis.text.x = element_blank())
library(hrbrthemes)
library(ggthemes)
library(extrafont)
::loadfonts()
extrafontfonts()
::import_chelseaMarket()
tvthemes%>%
Wageselect(wage,age,race)%>%
distinct()%>%
ggplot(aes(x=wage,y=age,shape=race))+
geom_point(aes(size=wage,color=race))+
geom_smooth(method=lm,se=FALSE,
fullrange=TRUE,
aes(color=race))+
scale_x_log10()+
scale_y_log10()+
xlim(20,250)+
labs(title="Multivariate Wage Analysis - age and race",
subtitle="Years: 2003 to 2009",
caption="#30DayChartChallenge 2022 #day15 - Multivariate\nDataSource: {ISLR2} Wage dataset | DataViz: Federica Gazzelloni",
shape="Race",color="Race",size="Wage",
x="Wage",y="Age")+
::scale_shape_tableau()+
ggthemes::scale_color_tableau()+
ggthemes::theme_pander()+
ggthemestheme(text = element_text(family="Chelsea Market"))
ggsave("day15_multivariate.png",
dpi=320,
width = 9,
height = 6)