Chapter 22: R

22.1 TonyKuoYJ

郭耀仁認識 R 的美好

https://bookdown.org/tonykuoyj/eloquentr/getting-started.html

https://bookdown.org/tonykuoyj/eloquentr/easy-installation.html#about-packages

install.packages()

library()

https://bookdown.org/tonykuoyj/eloquentr/getting-started.html

22.1.1 quick intro

Ctrl + Alt + I to insert a new code chunk in RStudio

Ctrl + Enter to run the current line

Ctrl + Shift + Enter to run the current chunk

R.version

##                _                                
## platform       x86_64-w64-mingw32               
## arch           x86_64                           
## os             mingw32                          
## crt            ucrt                             
## system         x86_64, mingw32                  
## status                                          
## major          4                                
## minor          2.1                              
## year           2022                             
## month          06                               
## day            23                               
## svn rev        82513                            
## language       R                                
## version.string R version 4.2.1 (2022-06-23 ucrt)
## nickname       Funny-Looking Kid

a <- 23 # prime
a

## [1] 23

combine <- c(11, 13) # twin prime
combine

## [1] 11 13

# ?c
# help(c)

Ctrl + L to clear the R console

Paths in R are written with the forward slash `/`, unlike the backslash `\` used in MS Windows paths.

22.1.1.1 function

# Sum of `x` and `y`; works element-wise on vectors, just like `+`.
add <- function(x, y) {
  x + y
}

add(11, 13)

## [1] 24

$BMI = \dfrac{BW\left[\text{Kg}\right]}{{BH\left[\text{m}\right]}^2}$

# Body mass index (BMI).
#   bw: body weight in kilograms
#   bh: body height in centimetres
# Returns bw divided by the squared height in metres; element-wise on vectors.
get_bmi <- function(bw, bh) {
  # The BMI formula is defined on metres, but height is passed in centimetres.
  bh_in_metres <- bh / 100
  bw / bh_in_metres^2
}

get_bmi(70, 170)

## [1] 24.22145

22.1.2 R style

https://bookdown.org/tonykuoyj/eloquentr/styleguide.html

snake_case rather than camelCase

22.1.3 data workflow or forward pipe

from method chaining in object-oriented programming to the forward pipe in functional programming

22.1.3.1 `%>%` operator

abs(-5:5)

##  [1] 5 4 3 2 1 0 1 2 3 4 5

# install.packages("magrittr")

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked _by_ '.GlobalEnv':
## 
##     add

-5:5 %>% abs()

##  [1] 5 4 3 2 1 0 1 2 3 4 5

# readable, but takes many lines
sys_date <- Sys.Date()
sys_date_yr <- format(sys_date, format = "%Y")
sys_date_num <- as.numeric(sys_date_yr)
sys_date_num

## [1] 2025

# fewer lines, but less readable
sys_date_num <- as.numeric(format(Sys.Date(), format = "%Y"))
sys_date_num

## [1] 2025

# use %>% operator to demonstrate data workflow or forward pipe
sys_date_num <- Sys.Date() %>%
   format(format = "%Y") %>%
   as.numeric()
sys_date_num

## [1] 2025

22.1.4 data processing with `dplyr`

https://bookdown.org/tonykuoyj/eloquentr/dplyr.html

Some `dplyr` functions work like their SQL counterparts.

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# install.packages("gapminder")

library(gapminder)

## Warning: package 'gapminder' was built under R version 4.2.3

head(gapminder)

## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.

library(gapminder)
library(dplyr)
library(magrittr)

gapminder %>%
  filter(year == 2007)

## # A tibble: 142 × 6
##    country     continent  year lifeExp       pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Afghanistan Asia       2007    43.8  31889923      975.
##  2 Albania     Europe     2007    76.4   3600523     5937.
##  3 Algeria     Africa     2007    72.3  33333216     6223.
##  4 Angola      Africa     2007    42.7  12420476     4797.
##  5 Argentina   Americas   2007    75.3  40301927    12779.
##  6 Australia   Oceania    2007    81.2  20434176    34435.
##  7 Austria     Europe     2007    79.8   8199783    36126.
##  8 Bahrain     Asia       2007    75.6    708573    29796.
##  9 Bangladesh  Asia       2007    64.1 150448339     1391.
## 10 Belgium     Europe     2007    79.4  10392226    33693.
## # ℹ 132 more rows

library(gapminder)
library(dplyr)
library(magrittr)

gapminder %>%
  filter(year == 2007) %>%
  select(country)

## # A tibble: 142 × 1
##    country    
##    <fct>      
##  1 Afghanistan
##  2 Albania    
##  3 Algeria    
##  4 Angola     
##  5 Argentina  
##  6 Australia  
##  7 Austria    
##  8 Bahrain    
##  9 Bangladesh 
## 10 Belgium    
## # ℹ 132 more rows

library(gapminder)
library(dplyr)
library(magrittr)

gapminder %>%
  mutate(pop_in_thousands = pop / 1000)

## # A tibble: 1,704 × 7
##    country     continent  year lifeExp      pop gdpPercap pop_in_thousands
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>            <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.            8425.
##  2 Afghanistan Asia       1957    30.3  9240934      821.            9241.
##  3 Afghanistan Asia       1962    32.0 10267083      853.           10267.
##  4 Afghanistan Asia       1967    34.0 11537966      836.           11538.
##  5 Afghanistan Asia       1972    36.1 13079460      740.           13079.
##  6 Afghanistan Asia       1977    38.4 14880372      786.           14880.
##  7 Afghanistan Asia       1982    39.9 12881816      978.           12882.
##  8 Afghanistan Asia       1987    40.8 13867957      852.           13868.
##  9 Afghanistan Asia       1992    41.7 16317921      649.           16318.
## 10 Afghanistan Asia       1997    41.8 22227415      635.           22227.
## # ℹ 1,694 more rows

library(gapminder)
library(dplyr)
library(magrittr)

gapminder %>%
  arrange(year)

## # A tibble: 1,704 × 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Albania     Europe     1952    55.2  1282697     1601.
##  3 Algeria     Africa     1952    43.1  9279525     2449.
##  4 Angola      Africa     1952    30.0  4232095     3521.
##  5 Argentina   Americas   1952    62.5 17876956     5911.
##  6 Australia   Oceania    1952    69.1  8691212    10040.
##  7 Austria     Europe     1952    66.8  6927772     6137.
##  8 Bahrain     Asia       1952    50.9   120447     9867.
##  9 Bangladesh  Asia       1952    37.5 46886859      684.
## 10 Belgium     Europe     1952    68    8730405     8343.
## # ℹ 1,694 more rows

total population in the world in 2007

library(gapminder)
library(dplyr)
library(magrittr)

gapminder %>%
  filter(year == 2007) %>%
  summarise(ttl_pop = sum(as.numeric(pop)))

## # A tibble: 1 × 1
##      ttl_pop
##        <dbl>
## 1 6251013179

total population grouped by continent in 2007

library(gapminder)
library(dplyr)
library(magrittr)

gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(ttl_pop = sum(as.numeric(pop)))

## # A tibble: 5 × 2
##   continent    ttl_pop
##   <fct>          <dbl>
## 1 Africa     929539692
## 2 Americas   898871184
## 3 Asia      3811953827
## 4 Europe     586098529
## 5 Oceania     24549947

22.1.5 visualization statically with `ggplot2`

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.3

library(gapminder)

gapminder_2007 <- gapminder %>%
  filter(year == 2007)
scatter_plot <- ggplot(gapminder_2007, aes(x = gdpPercap, y = lifeExp)) +
  geom_point()
scatter_plot

library(ggplot2)
library(gapminder)

north_asia <- gapminder %>%
  filter(country %in% c("China", "Japan", "Taiwan", "Korea, Rep."))
line_plot <- ggplot(north_asia, aes(x = year, y = gdpPercap, colour = country)) +
  geom_line()
line_plot

library(ggplot2)
library(gapminder)

hist_plot <- ggplot(gapminder_2007, aes(x = gdpPercap)) +
  geom_histogram()
hist_plot

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

hist_plot <- ggplot(gapminder_2007, aes(x = gdpPercap)) +
  geom_histogram(bins = 20)
hist_plot

library(ggplot2)
library(gapminder)

box_plot <- ggplot(gapminder_2007, aes(x = continent, y = gdpPercap)) +
  geom_boxplot()
box_plot

library(ggplot2)
library(gapminder)

gdpPercap_2007_na <- gapminder %>%
  filter(year == 2007 & country %in% c("China", "Japan", "Taiwan", "Korea, Rep."))
bar_plot <- ggplot(gdpPercap_2007_na, aes(x = country, y = gdpPercap)) +
  geom_bar(stat = "identity")
bar_plot

22.1.6 loop

https://bookdown.org/tonykuoyj/eloquentr/for.html

month.name

##  [1] "January"   "February"  "March"     "April"     "May"       "June"     
##  [7] "July"      "August"    "September" "October"   "November"  "December"

month.name[1]

## [1] "January"

for (month in month.name) {
  print(month)
}

## [1] "January"
## [1] "February"
## [1] "March"
## [1] "April"
## [1] "May"
## [1] "June"
## [1] "July"
## [1] "August"
## [1] "September"
## [1] "October"
## [1] "November"
## [1] "December"

22.1.7 variable type

https://bookdown.org/tonykuoyj/eloquentr/variable-types.html

https://www.w3schools.com/r/r_data_types.asp

numeric
integer
complex = complex number
character
logical = boolean

class(2L)

## [1] "integer"

class(2.0L)

## [1] "integer"

class(2.3L)

## [1] "numeric"

time: POSIXct POSIXt

class(Sys.time())

## [1] "POSIXct" "POSIXt"

0 %in% -5:5

## [1] TRUE

22.1.7.1 date

1970-01-01 = 0L

date_of_origin <- as.Date("1970-01-01")
as.integer(date_of_origin)

## [1] 0

check if type of x is Date

inherits(x, what = "Date")

convert character to Date

as.Date("01-01-1970", format = "%m-%d-%Y")

22.1.7.2 time

1970-01-01 00:00:00 GMT = 0L

tz = time zone

time_of_origin <- as.POSIXct("1970-01-01 00:00:00", tz = "GMT")
as.integer(time_of_origin)

## [1] 0

check if type of x is time

inherits(x, what = "POSIXct")

convert character to time

as.POSIXct("1970-01-01 00:00:00", tz = "GMT")

22.1.7.3 quotient `%/%` operator

https://www.w3schools.com/r/r_operators.asp

7 %/% 3

## [1] 2

22.1.8 data type

https://bookdown.org/tonykuoyj/eloquentr/vector-factor.html

1D
- vector^[22.1.8.1]
- factor^[22.1.8.2]
2D
- matrix^[22.1.8.3]
- data frame^[22.1.8.4]
$n$ D
- array^[22.2.6.1]
- list^[22.2.6.2]

22.1.8.1 vector

four_seasons <- c("spring", "summer", "autumn", "winter")
four_seasons

## [1] "spring" "summer" "autumn" "winter"

favorite_season <- four_seasons[3]
favorite_season

## [1] "autumn"

favorite_seasons <- four_seasons[c(-2, -4)]
favorite_seasons

## [1] "spring" "autumn"

A vector holds only one variable type; mixed elements are coerced to a common type.

lucky_numbers <- c(7L, 24)
class(lucky_numbers[1])

## [1] "numeric"

lucky_numbers <- c(7L, FALSE)
lucky_numbers

## [1] 7 0

class(lucky_numbers[2])

## [1] "integer"

mixed_vars <- c(TRUE, 7L, 24, "spring")
mixed_vars

## [1] "TRUE"   "7"      "24"     "spring"

class(mixed_vars[1])

## [1] "character"

class(mixed_vars[2])

## [1] "character"

class(mixed_vars[3])

## [1] "character"

22.1.8.1.1 logic

four_seasons <- c("spring", "summer", "autumn", "winter")
my_favorite_seasons <- four_seasons == "spring" | four_seasons == "autumn"
four_seasons[my_favorite_seasons]

## [1] "spring" "autumn"

22.1.8.1.2 `rep` repeat

rep(7L, times = 8)

## [1] 7 7 7 7 7 7 7 7

rep("R", times = 10)

##  [1] "R" "R" "R" "R" "R" "R" "R" "R" "R" "R"

22.1.8.1.3 `seq` sequence

seq(from = 7, to = 77, by = 7)

##  [1]  7 14 21 28 35 42 49 56 63 70 77

11:20

##  [1] 11 12 13 14 15 16 17 18 19 20

22.1.8.2 factor

https://bookdown.org/tonykuoyj/eloquentr/vector-factor.html#factor

four_seasons <- c("spring", "summer", "autumn", "winter")
four_seasons

## [1] "spring" "summer" "autumn" "winter"

four_seasons_factor <- factor(four_seasons)
four_seasons_factor

## [1] spring summer autumn winter
## Levels: autumn spring summer winter

four_seasons <- c("spring", "summer", "autumn", "winter")
four_seasons_factor <- factor(four_seasons, ordered = TRUE, levels = c("summer", "winter", "spring", "autumn"))
four_seasons_factor

## [1] spring summer autumn winter
## Levels: summer < winter < spring < autumn

temperatures <- c("warm", "hot", "cold")
temp_factors <- factor(temperatures, ordered = TRUE, levels = c("cold", "warm", "hot"))
temp_factors

## [1] warm hot  cold
## Levels: cold < warm < hot

If no levels are specified, the levels are ordered alphabetically, which may not match the real order (below, `hot` is placed before `warm`).

temperatures <- c("warm", "hot", "cold")
temp_factors <- factor(temperatures, ordered = TRUE)
temp_factors

## [1] warm hot  cold
## Levels: cold < hot < warm

22.1.8.3 matrix

https://bookdown.org/tonykuoyj/eloquentr/matrix-dataframe-more.html

my_mat <- matrix(1:6, nrow = 2)
my_mat

##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6

class(my_mat)

## [1] "matrix" "array"

my_mat2 <- matrix(1:6, nrow = 2, byrow = TRUE)
my_mat2

##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6

my_mat2[2, 3]

## [1] 6

my_mat2[2, ]

## [1] 4 5 6

my_mat2[, 3]

## [1] 3 6

# Logical mask of the cells strictly between 1 and 6. It is named `in_range`
# rather than `filter` so that it does not reuse the name of dplyr's filter().
in_range <- my_mat2 > 1 & my_mat2 < 6
my_mat2[in_range]

## [1] 4 2 5 3

Logical values are coerced to numbers (TRUE becomes 1, FALSE becomes 0) in a matrix, just as in a vector.

my_mat3 <- matrix(c(1, 2, TRUE, FALSE, 3, 4), nrow = 2)
my_mat3

##      [,1] [,2] [,3]
## [1,]    1    1    3
## [2,]    2    0    4

class(my_mat3[, 2])

## [1] "numeric"

22.1.8.4 data frame

variable: column
observation: row
value: cell

team_name <- c("Chicago Bulls", "Golden State Warriors")
wins <- c(72, 73)
losses <- c(10, 9)
is_champion <- c(TRUE, FALSE)
season <- c("1995-96", "2015-16")

great_nba_teams <- data.frame(team_name, wins, losses, is_champion, season)
great_nba_teams

##               team_name wins losses is_champion  season
## 1         Chicago Bulls   72     10        TRUE 1995-96
## 2 Golden State Warriors   73      9       FALSE 2015-16

great_nba_teams[1, 1]

## [1] "Chicago Bulls"

great_nba_teams[1, ]

##       team_name wins losses is_champion  season
## 1 Chicago Bulls   72     10        TRUE 1995-96

great_nba_teams[, 1]

## [1] "Chicago Bulls"         "Golden State Warriors"

stringsAsFactors = TRUE

team_name <- c("Chicago Bulls", "Golden State Warriors")
wins <- c(72, 73)
losses <- c(10, 9)
is_champion <- c(TRUE, FALSE)
season <- c("1995-96", "2015-16")
 
great_nba_teams <- data.frame(team_name, wins, losses, is_champion, season, stringsAsFactors = TRUE)
great_nba_teams[, 1]

## [1] Chicago Bulls         Golden State Warriors
## Levels: Chicago Bulls Golden State Warriors

stringsAsFactors = FALSE

team_name <- c("Chicago Bulls", "Golden State Warriors")
wins <- c(72, 73)
losses <- c(10, 9)
is_champion <- c(TRUE, FALSE)
season <- c("1995-96", "2015-16")
 
great_nba_teams <- data.frame(team_name, wins, losses, is_champion, season, stringsAsFactors = FALSE)
great_nba_teams[, 1]

## [1] "Chicago Bulls"         "Golden State Warriors"

22.1.8.4.1 selecting variable or column

great_nba_teams$team_name

## [1] "Chicago Bulls"         "Golden State Warriors"

great_nba_teams[, "team_name"]

## [1] "Chicago Bulls"         "Golden State Warriors"

22.1.8.4.2 filtering observation or row

# Keep only the rows of champion teams. `is_champion` is already a logical
# column, so it can index the rows directly without `== TRUE`. The mask is not
# called `filter`, to avoid reusing the name of dplyr's filter().
champion_rows <- great_nba_teams$is_champion
great_nba_teams[champion_rows, ]

##       team_name wins losses is_champion  season
## 1 Chicago Bulls   72     10        TRUE 1995-96

22.1.8.4.3 check mixed data type

str(great_nba_teams)

## 'data.frame':    2 obs. of  5 variables:
##  $ team_name  : chr  "Chicago Bulls" "Golden State Warriors"
##  $ wins       : num  72 73
##  $ losses     : num  10 9
##  $ is_champion: logi  TRUE FALSE
##  $ season     : chr  "1995-96" "2015-16"

22.2 W3Schools

https://www.w3schools.com/r/default.asp

22.2.1 same value for multiple variables

https://www.w3schools.com/r/r_variables_multiple.asp

# Assign the same value to multiple variables in one line
var1 <- var2 <- var3 <- "Orange"

# Print variable values
var1

## [1] "Orange"

var2

## [1] "Orange"

var3

## [1] "Orange"

22.2.2 legal variable name

https://www.w3schools.com/r/r_variables_name.asp

# Legal variable names:
myvar <- "John"
my_var <- "John"
myVar <- "John"
MYVAR <- "John"
myvar2 <- "John"
.myvar <- "John"

## Illegal variable names:
# 2myvar <- "John"
# my-var <- "John"
# my var <- "John"
# _my_var <- "John"
# my_v@ar <- "John"
# TRUE <- "John"

22.2.3 complex number

https://www.w3schools.com/r/r_data_types.asp

https://www.w3schools.com/r/r_numbers.asp

22.2.4 escape character

https://www.w3schools.com/r/r_strings_esc.asp

22.2.5 global assignment `<<-`

# `<<-` assigns `txt` outside the function instead of creating a local
# variable, so `txt` is still available after the call returns.
my_function <- function() {
  txt <<- "fantastic"
  paste("R is", txt)
}

my_function()

## [1] "R is fantastic"

print(txt)

## [1] "fantastic"

# `txt` already exists at the top level before the function is defined.
txt <- "awesome"
# Calling the function replaces that existing `txt` through `<<-`.
my_function <- function() {
  txt <<- "fantastic"
  paste("R is", txt)
}

my_function()

## [1] "R is fantastic"

paste("R is", txt)

## [1] "R is fantastic"

22.2.6 data type

22.2.6.1 array

https://www.w3schools.com/r/r_arrays.asp

# An array with one dimension with values ranging from 1 to 24
thisarray <- c(1:24)
thisarray

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24

# An array with more than one dimension
multiarray <- array(thisarray, dim = c(4, 3, 2))
multiarray

## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]   13   17   21
## [2,]   14   18   22
## [3,]   15   19   23
## [4,]   16   20   24

multiarray[2, 3, 2]

## [1] 22

22.2.6.2 list

https://www.w3schools.com/r/r_lists.asp

22.3 Apan Liao

R 演習室

https://www.youtube.com/playlist?list=PL5AC0ADBF65924EAD

22.3.1 data input

https://www.youtube.com/watch?v=STcIxf_vUWY&list=PL5AC0ADBF65924EAD&index=1

scan()
read
- read.table()
- read.csv()

22.3.2 descriptive statistics

https://www.youtube.com/watch?v=GL3Wv_45LaU&list=PL5AC0ADBF65924EAD&index=2