第 4 章 stat functions and positions

4.1 直方圖

4.1.1 間斷變數

geom_bar()：用來呈現不同x類別的樣本個數。
- 樣本個數會自動計算，呈現在y軸。
geom_col()：用來呈現不同x類別下y值高度。
- data frame要提供y值。

圖 4.1: Britain may soon have one of the highest minimum wages in the world

4.1.2 geom_col

aes_y是由資料給定

初任人員平均經常性薪資

範例：geom_col
初任人員平均經常性薪資：工業

# Download the starting-salary CSV into a data frame.
startSalaryTopCat <- read_csv(
  "https://raw.githubusercontent.com/tpemartin/github-data/master/startSalaryTopCat.csv"
)

# Collapse entries 2 to 7 of 大職業別 into one string separated by ','.
str_c(startSalaryTopCat$大職業別[2:7], collapse = "','")

[1] "工業部門','礦業及土石採取業','製造業','電力及燃氣供應業','用水供應及污染整治業','營造業"

# Keep the rows whose 大職業別 is one of the six listed categories.
startingSalary_industrial <- startSalaryTopCat %>%
  filter(
    大職業別 %in% c(
      '工業部門', '礦業及土石採取業', '製造業',
      '電力及燃氣供應業', '用水供應及污染整治業', '營造業'
    )
  )

# Empty list that the following snippets store their plots in.
graphList <- list()

# Drop the '工業部門' row and keep the remaining categories.
startingSalary_industrial_sub <- startingSalary_industrial %>%
  filter(大職業別 != '工業部門')

# Bar chart whose heights come straight from the 經常性薪資-薪資 column.
graphList$經常薪資_col0 <- startingSalary_industrial_sub %>%
  ggplot(aes(x = 大職業別)) +
  geom_col(aes(y = `經常性薪資-薪資`))

graphList$經常薪資_col0

4.1.2.1 改變排序

作法1

作法1: 將大職業別改成factor, 其levels以經常性薪資-薪資排序。

# Add 大職業別ordered: the same categories as an ordered factor whose levels
# are sorted by 經常性薪資-薪資.
startingSalary_industrial_sub <- startingSalary_industrial_sub %>%
  mutate(
    大職業別ordered = reorder(
      大職業別,
      `經常性薪資-薪資`,
      order = TRUE # TRUE (not the reassignable alias T) yields an ordered factor
    )
  )

# With the ordered factor on x, bars follow the order of the factor levels.
graphList$經常薪資_x有排序ed_col0 <- startingSalary_industrial_sub %>%
  ggplot() +
  geom_col(aes(x = 大職業別ordered, y = `經常性薪資-薪資`))

graphList$經常薪資_x有排序ed_col0

另一個更簡潔的寫法：

# Same chart, with the reordering done inline inside aes() instead of in a
# separate mutate() step. order = TRUE is spelled out because T can be
# reassigned.
graphList$經常薪資_x有排序ed_col1 <- startingSalary_industrial_sub %>%
  ggplot() +
  geom_col(
    aes(
      x = reorder(大職業別, `經常性薪資-薪資`, order = TRUE),
      y = `經常性薪資-薪資`
    )
  )

graphList$經常薪資_x有排序ed_col1

作法2

作法2: 使用scale_x_...中的limits設定調整。

# Take the level order of the ordered factor and use it as the x-axis limits,
# so the unordered 大職業別 column is drawn in that order.
breaks_order <- levels(startingSalary_industrial_sub$大職業別ordered)

graphList$經常薪資_x有排序ed_scaleLimits_col0 <- startingSalary_industrial_sub %>%
  ggplot() +
  geom_col(aes(x = 大職業別, y = `經常性薪資-薪資`)) +
  scale_x_discrete(limits = breaks_order)

graphList$經常薪資_x有排序ed_scaleLimits_col0

改變width

# Same chart with narrower bars (width = 0.7); the aesthetics are declared in
# ggplot() so later layers can reuse them.
graphList$經常薪資_x有排序ed_scaleLimits_geomWidth_col0 <-
  startingSalary_industrial_sub %>%
  ggplot(aes(x = 大職業別ordered, y = `經常性薪資-薪資`)) +
  geom_col(width = 0.7) +
  scale_x_discrete(limits = breaks_order)

graphList$經常薪資_x有排序ed_scaleLimits_geomWidth_col0

上面我們將aes(x=大職業別ordered,y=`經常性薪資-薪資`)由geom_col()移到ggplot()，這樣在後面進行layer疊加時，若使用相同aes可以省略不寫。

也可以先建立一個基本ggplot方便後面疊加

# Base plot holding only the data and aesthetics; layers are added to it later.
graphList$經常薪資_x有排序ed_ggplotOnly <- startingSalary_industrial_sub %>%
  ggplot(aes(x = 大職業別ordered, y = `經常性薪資-薪資`))

改變高寬比例aspect.ratio

# Set the panel's aspect ratio to 1/1.3 through the theme.
graphList$經常薪資_x有排序_scalLimits_gmWidth_asp0_col0 <-
  graphList$經常薪資_x有排序ed_scaleLimits_geomWidth_col0 +
  theme(aspect.ratio = 1 / 1.3)

graphList$經常薪資_x有排序_scalLimits_gmWidth_asp0_col0

X軸字體重疊在一起可以透過theme() layer去調整axis.text.x值:

theme(axis.text.x= <參數值> )
其中<參數值>設定為一個list元素，其內容設定複雜，一般透過element_text()函數來生成。
- element_text(angle=... , hjust=... , vjust=...)

# Build on the base plot: bars of width 0.5, x ordered by breaks_order, and a
# square panel (aspect.ratio = 1).
graphList$經常薪資_x有排序_scalLimits_gmWidth_asp1_col0 <-
  graphList$經常薪資_x有排序ed_ggplotOnly +
  geom_col(width = 0.5) +
  scale_x_discrete(limits = breaks_order) +
  theme(aspect.ratio = 1)

graphList$經常薪資_x有排序_scalLimits_gmWidth_asp1_col0

字轉45度

# Rotate the x-axis labels by 45 degrees.
graphList$經常薪資_x有排序_scalLimits_gmWidth_asp1_col0 +
  theme(axis.text.x = element_text(angle = 45))

字轉45度，水平調整為1

# Rotate the x-axis labels by 45 degrees and set their horizontal
# justification to 1.
graphList$經常薪資_x有排序_scalLimits_Width_asp_textAdj_col0 <-
  graphList$經常薪資_x有排序_scalLimits_gmWidth_asp1_col0 +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

graphList$經常薪資_x有排序_scalLimits_Width_asp_textAdj_col0

座標旋轉coord_flip

# Swap the x and y axes so the bars run horizontally.
graphList$經常薪資_x有排序_sclLimits_width_asp_flip_col0 <-
  graphList$經常薪資_x有排序_scalLimits_gmWidth_asp1_col0 +
  coord_flip()

graphList$經常薪資_x有排序_sclLimits_width_asp_flip_col0

請試著產生如下圖形：

長條圖顏色: #5A99B3
文字使用geom_text(aes(x=..,y=..,label=...))並更改color及nudge_y（可以有正負值，表示文字所放位置所對應y值要+或-多少）

4.1.3 geom_bar

aes y mapping是由geom_bar去呼叫stat_count函數計算count(數個數)。

4.1.3.1 圖書借閱資料

library(readr)

# Download the library2014 CSV from Dropbox.
library2014 <- read_csv(
  "https://www.dropbox.com/s/999hy0u1y98y6ep/library2014.csv?dl=1"
)

資料整理：2014-09-01到2015-06-30間資料

library(readr)

# Download the library100_102 CSV from Dropbox.
library100_102 <- read_csv(
  "https://www.dropbox.com/s/wuo5o6l55lk68l6/library100_102.csv?dl=1"
)

# Derive the loan date and loan year from the 借閱時間 timestamp.
library100_102 <- library100_102 %>%
  mutate(
    借閱日期 = date(ymd_hms(借閱時間)),
    借閱年 = year(借閱日期)
  )

# Keep the loans dated 2014-09-01 through 2015-06-30.
library2014 <- library100_102 %>%
  filter(between(借閱日期, ymd("2014-09-01"), ymd("2015-06-30")))

# Collapse to one row per 學號, keeping the last 學院 seen and the highest
# 讀者年級. (The former mutate(讀者年級 = 讀者年級) step assigned the column to
# itself and has been dropped.)
library2014 <- library2014 %>%
  group_by(學號) %>%
  summarise(
    學院 = last(學院),
    讀者年級 = max(讀者年級)
  ) %>%
  ungroup()

# Turn both columns into ordered factors: 學院 is ordered by the number of
# rows in each 學院 (length of 學號), 讀者年級 by its own value.
# order = TRUE is spelled out because T can be reassigned.
library2014 <- library2014 %>%
  mutate(
    學院 = reorder(學院, 學號, length, order = TRUE),
    讀者年級 = reorder(讀者年級, 讀者年級, order = TRUE)
  )

# Base plot carrying only the data; each example below adds its own layer.
graphList$圖書_ggplotOnly <- library2014 %>%
  ggplot()

# Row counts per 學院, drawn in a single colour.
graphList$圖書_ggplotOnly +
  geom_bar(aes(x = 學院), fill = "#5A99B3", width = 0.7)

# Same counts, with the bar fill mapped to 讀者年級.
graphList$圖書_ggplotOnly +
  geom_bar(aes(x = 學院, fill = 讀者年級), width = 0.7)

試著做出類似下圖：

4.1.4 連續變數

直方圖的另一個常見用法是將連續變數：

（一）先切成一段段不重疊的數值區間：稱為binning，每個區間稱為bin。

（二）以每個bin為長條圖x軸的類別變數進行作圖

# Fix the random seed so the 100 standard-normal draws are reproducible.
set.seed(seed = 2019)
x <- rnorm(n = 100)

# Show the first six draws.
head(x, n = 6L)

[1] 0.7385 -0.5148 -1.6402 0.9160 -1.2675 0.7382

# Cut x into 8 intervals and list the resulting factor levels.
x_interval <- ggplot2::cut_interval(x, n = 8)
levels(x_interval)

[1] "[-2.26,-1.65]"  "(-1.65,-1.04]"  "(-1.04,-0.426]"
[4] "(-0.426,0.186]" "(0.186,0.799]"  "(0.799,1.41]"
[7] "(1.41,2.02]"    "(2.02,2.64]"

# Show the interval assigned to each of the first six x values.
head(x_interval, n = 6L)

[1] (0.186,0.799]  (-1.04,-0.426] (-1.65,-1.04]
[4] (0.799,1.41]   (-1.65,-1.04]  (0.186,0.799]
8 Levels: [-2.26,-1.65] ... (2.02,2.64]

ggplot2::cut_interval(x,n=8): 將連續資料x分成n個區間，並將x值各別對應該所屬區間（形成x_interval）

# Pair every x value with the interval it falls in.
df_x <- data.frame(x = x, x_interval = x_interval)

# Count the observations in each interval by hand, then draw the counts with
# geom_col().
df_x %>%
  group_by(x_interval) %>%
  summarise(interval_count = n()) %>%
  ungroup() %>% # View
  ggplot(aes(x = x_interval)) +
  geom_col(aes(y = interval_count))

4.1.5 geom_histogram

# Histogram of x with 8 bins; the binning and counting are done by the geom.
df_x %>%
  ggplot(aes(x = x)) +
  geom_histogram(bins = 8)

「geom_bar, geom_col」和geom_histogram最大的不同是長條間有沒有留空隙。連續型x變數應使用geom_histogram以正確保留其連續意涵。

4.1.6 optimal bins

原則上「樣本越大」、「資料越集中」則bin數目越多。有不少決定bins或binwidth的公式，大致上大同小異。這裡我們使用grDevices::nclass.FD(), 依Freedman-Diaconis法則選bins數。

# Number of bins chosen by the Freedman-Diaconis rule for df_x$x.
optimBins <- grDevices::nclass.FD(df_x[["x"]])
optimBins

[1] 10

# Histogram of x using the bin count stored in optimBins.
df_x %>%
  ggplot(aes(x = x)) +
  geom_histogram(bins = optimBins)

4.2 stat function

以stat_<計算方式>命名的函數。
- 會從required aesthetics計算結果，產生computed variables.
- 本身會自成一個layer，用geom=...來設定此layer的幾何圖像。
- 用aes(....)來設定此layer的mapping及required aesthetics。

4.2.1 一體兩面stat/geom

stat_count與geom_bar

geom_bar(..., stat="count")
stat_count(..., geom="bar")

stat_count(): it counts the number of cases at each x position.
- required aesthetics: x; computed variables: count, prop.

使用geom_bar

# Drawn with geom_bar(): the y values are computed by stat_count from x.
graphList$圖書_ggplotOnly +
  geom_bar(aes(x = 學院)) # default stat = "count"

使用stat_count

# The same layer written from the stat side.
graphList$圖書_ggplotOnly +
  stat_count(
    aes(x = 學院),
    geom = "bar" # by default geom_bar takes count as y
  )

要取出compute variable可以用stat(<compute variable>)或..<compute variable>..

# Map y to the computed variable prop instead of the default count.
graphList$圖書_ggplotOnly +
  stat_count(
    aes(x = 學院, y = stat(prop)),
    geom = "bar" # bars now use prop, not count, as their height
  )

4.2.2 override geom

stat_count與geom_bar只是在預設值下為一體兩面，實際上可以更改預設值做更有彈性使用，例如：

# stat_count layer drawn as text: the computed count is used both as the
# y position and as the label.
stat_count(
  aes(
    x = 學院,
    y = stat(count),
    label = stat(count)
  ),
  geom = "text"
)

將stat_count的結果結合出aes(x=學院,y=stat(count),label=stat(count)) mapping用來進行geom_text繪製。

# Bars of counts per 學院, plus a text layer printing each count at bar height.
graphList$圖書_ggplotOnly +
  geom_bar(aes(x = 學院)) +
  stat_count(
    aes(x = 學院, y = stat(count), label = stat(count)),
    geom = "text"
  )

改成比例使用prop這個computed value.

# Same chart using the computed variable prop for both bar height and label.
graphList$圖書_ggplotOnly +
  geom_bar(aes(x = 學院, y = stat(prop))) +
  stat_count(
    aes(x = 學院, y = stat(prop), label = stat(prop)),
    geom = "text"
  )

stat_count的prop定義為groupwise proportion：

內定group=x，此時會去算不同x值在不同x群的比例，最後只有1或0（0圖面看不出來）
要算在全體資料下的prop可以設定group=某固定值（值的type不重要，重點是固定）。

# group = "全校" is a constant, so every row falls in one group and prop is
# computed over the whole data set.
graphList$圖書_ggplotOnly +
  geom_bar(aes(x = 學院, y = stat(prop), group = "全校")) +
  stat_count(
    aes(
      x = 學院,
      y = stat(prop),
      group = "全校",
      label = round(stat(prop), digits = 2)
    ),
    geom = "text",
    color = "white",
    # NOTE(review): the text after this example says nudge_y cannot be set
    # through stat_count()
    nudge_y = -0.01
  ) +
  labs(title = "各院借閱人次佔總人次比", x = "", y = "")

4.2.3 override stat

雖然任何layer都可以由geom或stat產生，但兩者是不同的object，geom是geometric object而stat是statistical transformation object。有些設定只對某一個object type有效，如geom_text底下的nudge。

前面的圖在stat_count下無法進行nudge_y設定，而只有geom_text才可以。我們可以改使用geom_text，但override它的stat（內定為"identity"）。

# Same labels written from the geom side: geom_text() with its stat switched
# to "count", with nudge_y = -0.01 given to geom_text().
graphList$圖書_ggplotOnly +
  geom_bar(aes(x = 學院, y = stat(prop), group = "全校")) +
  geom_text(
    aes(
      x = 學院,
      y = stat(prop),
      group = "全校",
      label = round(stat(prop), digits = 2)
    ),
    stat = "count",
    color = "white",
    nudge_y = -0.01
  ) +
  labs(title = "各院借閱人次佔總人次比", x = "", y = "")

請以df_x完成下圖（bins=8）：

4.2.4 自創stat函數

This stat makes it easy to superimpose a function on top of an existing plot. The function is called with a grid of evenly spaced values along the x axis, and the results are drawn (by default) with a line.

stat_function(mapping = NULL, data = NULL, geom = "path",
  position = "identity", ..., fun, xlim = NULL, n = 101,
  args = list(), na.rm = FALSE, show.legend = NA,
  inherit.aes = TRUE)

會從x軸等距切出許多x值，代到fun設定去計算結果，computed variables: x及y.
說明：https://ggplot2.tidyverse.org/reference/stat_function.html
fun：函數名稱
n: x軸切多少點
args: List of additional arguments to pass to fun

4.2.4.1 常態分配

# Draw 500 values from a normal distribution with mean 2 and sd 1.
df_x <- data.frame(x = rnorm(500, mean = 2, sd = 1))

# Density-scaled histogram with the matching normal density drawn on top.
df_x %>%
  ggplot(aes(x = x)) +
  geom_histogram(aes(y = stat(density))) +
  stat_function(
    fun = dnorm, # dnorm is the density function of the normal distribution
    args = list(mean = 2, sd = 1)
  )

4.2.4.2 市場供需

# Linear demand curve: returns a + b * x (defaults: intercept 1, slope -1).
market_demand <- function(x, a = 1, b = -1) {
  a + b * x
}

# Linear supply curve: returns a + b * x (defaults: intercept 0, slope 1).
market_supply <- function(x, a = 0, b = 1) {
  a + b * x
}

# x values from 0 to 1 in steps of 0.2.
df_grids <- data.frame(x = seq(from = 0, to = 1, by = 0.2))

# Draw both curves with stat_function(), referring to each function by name,
# and restrict the y axis to [0, 1].
graphList$demand_supply <- df_grids %>%
  ggplot(aes(x = x)) +
  stat_function(fun = "market_demand") +
  stat_function(fun = "market_supply") +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = "Quantity", y = "Price")

graphList$demand_supply

# Add a second demand curve in red, passing slope b = -2 through args.
graphList$demand_supply +
  stat_function(
    fun = "market_demand",
    args = list(b = -2),
    color = 'red'
  )

4.2.5 stat_summary

stat_summary/stat_summary_bin:

將每個 x 或 binned x下的所有「y值丟到指定函數」去計算圖面所要的y, ymin, ymax三個aes mapping.
無computed variables to call, 會直接設定y, ymin, ymax三個mapping。

stat_summary_2d/stat_summary_hex:

將每個 x,y下的所有「z值丟到指定函數」去計算圖面所要的summary statistics.
computed variables: value (即summary statistics值)

在graphList$demand_supply加上如下的total welfare陰影。（使用geom_ribbon及stat_summary）

# Shade the area between the two curves for x from 0.25 to 0.5.
# The ribbon uses stat = "summary": y is mapped to x itself, so each summary
# function receives the x value and returns the curve height at that point.
# fun.min / fun.max are the summary-stat parameter names in ggplot2 >= 3.3.0;
# the older fun.ymin / fun.ymax are not recognised when passed through a geom.
# Over this x range supply lies below demand, so supply is the lower bound.
graphList$demand_supply +
  geom_ribbon(
    data = data.frame(
      x = seq(0.25, 0.5, by = 0.01)
    ),
    aes(y = x),
    stat = "summary",
    fun.min = function(y) market_supply(y),
    fun.max = function(y) market_demand(y),
    alpha = 0.3
  )

# Base plot of price against carat for the diamonds data.
d <- ggplot(diamonds, aes(carat, price))

# Scatter plot.
d + geom_point()

# Hexagonal bins, filled by the computed variable density.
d + geom_hex(aes(fill = stat(density)))

4.3 Positions

所有的geom都有position設定，如：geom_bar(position="stack")。

4.3.1 stack

stack：疊上
使用position="stack"或position=position_stack(...)設定——後者有更多調整彈性。

# Install devtools only when it is missing. requireNamespace() checks for the
# package without attaching it; the call below uses the devtools:: prefix, so
# attaching is not needed.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}

# Install ggpubr from GitHub.
devtools::install_github("kassambara/ggpubr")

# Six observations: three y values under "a" and three under "b".
df_position <- data.frame(
  x = rep(c("a", "b"), each = 3),
  y = c(3, 1, 3, 8, 6, 10)
)

# Reference plot: points drawn without any position adjustment.
graphList$position_none <- df_position %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(color = "#5A99B3") +
  scale_y_continuous(breaks = c(1, 3, 6, 8, 10)) +
  annotate(
    geom = "text",
    # uses the fact that a factor is stored as integers to set the x position
    x = 1.1, y = 3, label = "x 2"
  ) +
  labs(
    title = "Position identity",
    subtitle = "Position沒有調整"
  )

# Same points with position = "stack"; the point colour is fixed in the layer.
graphList$position_stack <- df_position %>%
  ggplot(aes(x = x, y = y, color = y)) +
  geom_point(position = "stack", color = "#5A99B3") +
  labs(
    title = "Position stack",
    subtitle = "各x類y值疊加上去"
  )

# Show the unadjusted and stacked plots together.
ggpubr::ggarrange(graphList$position_none, graphList$position_stack)

4.3.2 fill

fill：填滿
相同x值下有多個y值時（標準化成同高度，呈現比重變化用）
使用position="fill"或position=position_fill(...)設定，後者有更多調整彈性。

# Same points with position = "fill"; the point colour is fixed in the layer.
graphList$position_fill <- df_position %>%
  ggplot(aes(x = x, y = y, color = y)) +
  geom_point(position = "fill", color = "#5A99B3") +
  labs(
    title = "Position fill",
    subtitle = "各x類y值縮放同比例使加總為1"
  )

# Show the unadjusted and filled plots together.
ggpubr::ggarrange(graphList$position_none, graphList$position_fill)

4.3.3 dodge

dodge：躲避
在不改變vertical position下，調整horizontal position使geom不重疊。
使用position="dodge"或position=position_dodge(...)設定，後者有更多調整彈性。

# Two point layers: large translucent points at the original positions, and
# regular points moved by position_dodge2(width = 0.3).
graphList$position_dodge <- df_position %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(color = "#5A99B3", alpha = 0.3, size = 4) +
  geom_point(position = position_dodge2(width = 0.3), color = "#5A99B3") +
  labs(
    title = "Position dodge",
    subtitle = "淺色大圈為原始資料，\n深色小圈為position調整後" # \n is a line break
  )

# Show the unadjusted and dodged plots together.
ggpubr::ggarrange(graphList$position_none, graphList$position_dodge)

4.3.4 geom_bar應用

position_dodge

比較各大項內的小項差異
也比較各小項在各大項內的差異

# Base plot with 學院 on x and 讀者年級 as the fill.
graphList$圖書_ggplotOnly_with_aes <- library2014 %>%
  ggplot(aes(x = 學院, fill = 讀者年級))

graphList$圖書_ggplotOnly_with_aes +
  geom_bar() # default position = "stack"

# Dodged bars; where an x has fewer fill categories, its bars come out wider.
graphList$圖書_positionDodge <- graphList$圖書_ggplotOnly_with_aes +
  geom_bar(width = 0.7, position = "dodge")

graphList$圖書_positionDodge

使用position_dodge()微調

# Fine-tune with position_dodge(): bar width 0.7 and dodge width 0.8 leave a
# 0.1 gap between bars.
graphList$圖書_positionDodge_preserve <- graphList$圖書_ggplotOnly_with_aes +
  geom_bar(
    width = 0.7,
    position = position_dodge(width = 0.8, preserve = "single")
  )

graphList$圖書_positionDodge_preserve

電機、法律學院沒有1年級資料，count時會少一類並不會計0，造成圖面的不一致。此時建議自行count並以stat="identity"方式作圖：

# Count the rows for every (學院, 讀者年級) pair, then append two zero-count
# rows for 讀者年級 1 of 電機資訊學院 and 法律學院.
# NOTE(review): 學院 and 讀者年級 were converted to ordered factors earlier in
# this file; confirm add_row() accepts the character / numeric values below.
library2014count <- library2014 %>%
  select(學院, 讀者年級) %>%
  arrange(學院, 讀者年級) %>%
  group_by(學院, 讀者年級) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  add_row(
    學院 = c("電機資訊學院", "法律學院"),
    讀者年級 = c(1, 1),
    count = c(0, 0)
  )

# Base plot built on the pre-counted table.
graphList$圖書_ggplotOnly_with_aes2 <- library2014count %>%
  ggplot(aes(x = 學院, fill = 讀者年級))

# Bar heights come from the count column, so stat = "identity" is used.
# Bar width 0.7 and dodge width 0.8 leave a 0.1 gap; preserve = "single" can
# be omitted when every category is present.
graphList$圖書_positionDodge2 <- graphList$圖書_ggplotOnly_with_aes2 +
  geom_bar(
    aes(y = count),      # bar height supplied by the data
    stat = "identity",   # needed when y is supplied by the data
    width = 0.7,
    position = position_dodge(width = 0.8)
  )

graphList$圖書_positionDodge2

請使用position_dodge適當修改下圖數字位置：

# Add the counts as text with no position adjustment; this is the starting
# point of the exercise stated above.
graphList$圖書_positionDodge2_textExample <- graphList$圖書_positionDodge2 +
  geom_text(aes(y = count, label = count))

graphList$圖書_positionDodge2_textExample

當geom有使用stat函數時，其計算後用來作圖的資料可用ggplot_build()產生：

存在產生list物件的data元素裡，data是個list，其數字代表圖面對應layer層次。

# Build the plot and keep the resulting object.
graphList$圖書_positionDodge_build <-
  ggplot_build(graphList$圖書_positionDodge_preserve)

# Show the first rows of the data behind layer 2 of the text example.
head(layer_data(graphList$圖書_positionDodge2_textExample, i = 2))

       fill   y x label PANEL group colour size angle hjust vjust alpha family fontface lineheight
1 #31688EFF  52 1    52     1     2  black 3.88     0   0.5   0.5    NA               1        1.2
2 #35B779FF  53 1    53     1     3  black 3.88     0   0.5   0.5    NA               1        1.2
3 #FDE725FF  22 1    22     1     4  black 3.88     0   0.5   0.5    NA               1        1.2
4 #440154FF   1 2     1     1     5  black 3.88     0   0.5   0.5    NA               1        1.2
5 #31688EFF 100 2   100     1     6  black 3.88     0   0.5   0.5    NA               1        1.2
6 #35B779FF 124 2   124     1     7  black 3.88     0   0.5   0.5    NA               1        1.2

position_stack

強調各大類總額差別
及各大類總額的次類組成份子大小

# Stacked bars built from the pre-computed counts.
graphList$圖書_ggplotOnly_with_aes2 +
  geom_bar(
    aes(y = count),
    stat = "identity",
    width = 0.7,
    position = "stack"
  )

# Same bars plus white count labels, stacked with vjust = 0.5.
graphList$圖書_ggplotOnly_with_aes2 +
  geom_bar(
    aes(y = count),
    stat = "identity",
    width = 0.7,
    position = "stack"
  ) +
  geom_text(
    aes(x = 學院, label = count, y = count),
    position = position_stack(vjust = 0.5),
    color = "white"
  )

使用geom_label可以較清楚標示：

# Stacked bars with the counts drawn by geom_label() instead of geom_text().
graphList$圖書_ggplotOnly_with_aes2 +
  geom_bar(
    aes(y = count),
    stat = "identity",
    width = 0.7,
    position = "stack"
  ) +
  geom_label(
    aes(x = 學院, label = count, y = count),
    position = position_stack(vjust = 0.5),
    color = "white"
  )

position_fill

強調組成份子比例的變化。

# Bars with position = "fill"; the count labels use position_fill(vjust = 0.5).
graphList$圖書_ggplotOnly_with_aes2 +
  geom_bar(
    aes(y = count),
    stat = "identity",
    width = 0.7,
    position = "fill"
  ) +
  geom_text(
    aes(x = 學院, label = count, y = count),
    position = position_fill(vjust = 0.5),
    color = "white"
  )

4.3.5 其他position

# position_nudge(): draw each label shifted by -0.1 in y from its point.
df <- data.frame(
  x = c(1, 3, 2, 5),
  y = c("a", "c", "d", "c")
)
ggplot(df, aes(x, y)) +
  geom_point() +
  geom_text(
    aes(label = y),
    position = position_nudge(y = -0.1)
  )

# position_jitter(): jittered points drawn over the boxplots.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot(colour = "grey50") +
  geom_point(position = position_jitter())

# position_jitterdodge() on a random sample of 1000 diamonds rows.
dsub <- diamonds[sample(nrow(diamonds), 1000), ]
ggplot(dsub, aes(x = cut, y = carat, fill = clarity)) +
  geom_boxplot(outlier.size = 0) +
  geom_point(pch = 21, position = position_jitterdodge())