# 1 R 資料結構

## 1.1 變數與基本的運算

# Basic arithmetic operators
3 + 4
3 - 4
3 * 4
3 / 4
15 %% 2  # get "1" (remainder of the division)
15 %/% 2  # get "7" (integer quotient)

# Store results in variables and reuse them
x <- 3 * 4
y <- 4 * 20
z <- x + y
z  # get "92"

# class() reports the type of the value held by a variable
GPA <- 4.3
class(GPA)  # get "'numeric'"

word <- "GPA"
class(word)  # get "'character'"

yes.no <- TRUE
class(yes.no)  # get "'logical'"

# An explicit conversion is needed to obtain an integer
GTA <- as.integer(5)
class(GTA)  # get "'integer'"

complex.number <- 2 + 5i
class(complex.number)  # get "'complex'"

as.integer(TRUE)  # get "1"

# Scientific notation
2.35e7  # get "23500000"

Logical value 也可以進行運算，其中 TRUE 代表 1，而 FALSE 代表 0，如：

# In arithmetic, TRUE counts as 1 and FALSE as 0
TRUE + TRUE  # get "2"
TRUE + FALSE  # get "1"
TRUE * FALSE  # get "0"
FALSE * FALSE  # get "0"

# NULL is the empty object and has a class of its own
null.object <- NULL
null.object  # get "NULL"
class(null.object)  # get "'NULL'"

NA 的意思是 not available，常常表示 missing data。但特別的是，用 class() 檢視值為 NA 的變數時，會發現其為一 logical value。

# A bare NA is a logical value, so class() reports 'logical', not 'NA'
missing <- NA
missing  # get "NA"
class(missing)  # get "'logical'"

NA 運算後也會產生 NA

# Any arithmetic involving NA yields NA
1 + NA  # get "NA"
TRUE - NA  # get "NA"

## 1.2 向量

# c() combines values into a vector; every element shares one type
vector.numbers <- c(1, 2, 3, 4)
vector.numbers  # get "1 2 3 4"
length(vector.numbers)  # get "4"
class(vector.numbers)  # get "'numeric'"

vector.characters <- c("R", "is", "cool")
vector.characters  # get "'R'    'is'   'cool'"
class(vector.characters)  # get "'character'"

vector.logicals <- c(TRUE, FALSE)
vector.logicals  # get "TRUE FALSE"
class(vector.logicals)  # get "'logical'"

# Mixing types coerces all elements to a common type:
# numbers and logicals become character when combined with strings
c("I", "am", 1)  # get "'I'  'am' '1'"
class(c("I", "am", 1))  # get "'character'"

c("He", "is", "the", TRUE)  # get "'He'   'is'   'the'  'TRUE'"
class(c("He", "is", "the", TRUE))  # get "'character'"

# A logical combined with a number becomes numeric
c(1, FALSE)  # get "1 0"
class(c(1, FALSE))  # get "'numeric'"

# NA takes on the type of the other elements
c(NA, "1")  # get "NA  '1'"
class(c(NA, "1"))  # get "'character'"

c(NA, TRUE)  # get "NA  TRUE"
class(c(NA, TRUE))  # get "'logical'"

### 1.2.1 創造向量的其他方法

# The colon operator builds a run of consecutive integers
3:12  # get "3  4  5  6  7  8  9 10 11 12"
class(3:12)  # get "'integer'"

# rep() repeats a vector
rep(c(3, -1, 0.5), 3)  # get "3.0 -1.0  0.5  3.0 -1.0  0.5  3.0 -1.0  0.5"
rep(c(3, -1, 0.5), times=3)  # get "3.0 -1.0  0.5  3.0 -1.0  0.5  3.0 -1.0  0.5"
# So the second argument defaults to the number of times the whole vector is repeated,
# but each element can be repeated individually instead, as shown below:
rep(c(3, -1, 0.5), each=3)  # get "3.0  3.0  3.0 -1.0 -1.0 -1.0  0.5  0.5  0.5"

# A vector passed as `times` gives a separate repeat count per element
rep(c(3, -1, 0.5), times=c(2, 1, 3)) # get "3.0  3.0 -1.0  0.5  0.5  0.5"

# length.out repeats the vector until the result has the requested length
rep(c(3, -1, 0.5), length.out=8)  # get "3.0 -1.0  0.5  3.0 -1.0  0.5  3.0 -1.0"

# seq(from, to, by): the sequence stops at the last value not exceeding `to`
seq(0, 10, 2.5)  # get "0.0  2.5  5.0  7.5 10.0"
seq(0, 10, 2.3)  # get "0.0 2.3 4.6 6.9 9.2"

### 1.2.2 標籤

# names() attaches a label to each element of a vector
temperatures <- c(28, 29, 27, 27, 30)
names(temperatures) <- c("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday")
temperatures
##    Monday   Tuesday Wednesday  Thursday    Friday
##        28        29        27        27        30

# Method 1: reuse the labels already attached to another vector
rains <- c(0, 5, 6, 0, 2)
names(rains) <- names(temperatures)
rains
##    Monday   Tuesday Wednesday  Thursday    Friday
##         0         5         6         0         2
# Method 2: keep the labels in their own vector and assign that
rains <- rep(NULL, 5)  # reset the named vector assigned above (rep(NULL, 5) is just NULL)
rains <- c(0, 5, 6, 0, 2)
days <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(rains) <- days
rains
##    Monday   Tuesday Wednesday  Thursday    Friday
##         0         5         6         0         2

### 1.2.3 向量的運算

# Arithmetic on a vector is applied element by element; names are kept
# Kelvin degrees
Ktemp <- temperatures + 273.15
Ktemp
##    Monday   Tuesday Wednesday  Thursday    Friday
##    301.15    302.15    300.15    300.15    303.15
# Fahrenheit degrees
Ftemp <- temperatures * 1.8 + 32
Ftemp
##    Monday   Tuesday Wednesday  Thursday    Friday
##      82.4      84.2      80.6      80.6      86.0

# sum() collapses the vector to a single total
total.rains <- sum(rains)
total.rains  # get "13"

### 1.2.4 存取向量中的元素

# Access a single element
rains[2]
## Tuesday
##       5
# Access several elements
rains[c(2, 5)]
## Tuesday  Friday
##       5       2
# Access consecutive elements
rains[2: 5]
##   Tuesday Wednesday  Thursday    Friday
##         5         6         0         2
# Access elements at evenly spaced positions
rains[seq(1, 5, 2)]
##    Monday Wednesday    Friday
##         0         6         2

# Access elements by their names
rains[c("Monday", "Tuesday")]
##  Monday Tuesday
##       0       5

### 1.2.5 關係

• <：小於。
• >：大於。
• <=：小於或等於。
• >=：大於或等於。
• ==：相等。
• !=：不相等。

# Comparisons return logical values
4 > 3  # get "TRUE"
4 != 4  # get "FALSE"

# Comparing a vector yields one logical value per element
rains > 0
##    Monday   Tuesday Wednesday  Thursday    Friday
##     FALSE      TRUE      TRUE     FALSE      TRUE

# & is AND, | is OR, xor() is exclusive OR
3 == 4 & 3 == 3  # get "FALSE"
3 == 4 | 3 == 3  # get "TRUE"
3 < 5 & 4 > 2  # get "TRUE"
TRUE & TRUE & FALSE  # get "FALSE"
xor(3 > 2, FALSE)  # get "TRUE"
xor(3 > 2, TRUE)  # get "FALSE"
xor(3 < 2, FALSE)  # get "FALSE"

# Days without rain
not.rainy.days <- rains == 0
not.rainy.days
##    Monday   Tuesday Wednesday  Thursday    Friday
##      TRUE     FALSE     FALSE      TRUE     FALSE
# Hot days
hot.days <- temperatures >= 29
hot.days
##    Monday   Tuesday Wednesday  Thursday    Friday
##     FALSE      TRUE     FALSE     FALSE      TRUE

# hot.days is TRUE for Tuesday and Friday,
# so indexing rains with it returns the names and values at those positions
rains[hot.days]
## Tuesday  Friday
##       5       2

# all() / any(): does every / at least one element satisfy the condition?
all(temperatures >= 28)  # get "FALSE"
any(temperatures >= 30)  # get "TRUE"

# which() returns the positions where the condition is TRUE
which(temperatures == 27)
## Wednesday  Thursday
##         3         4

### 1.2.6 排序

# order() returns the positions that would sort the vector
some.vector <- c(3, 7, 9, 6, 2, 8)
order(some.vector)
## [1] 5 1 4 2 6 3

# Indexing with those positions produces the sorted vector
some.vector[order(some.vector)]
## [1] 2 3 6 7 8 9

# sort() returns the sorted values directly
some.vector <- c(3, 7, 9, 6, 2, 8)
sort(some.vector)
## [1] 2 3 6 7 8 9

### 1.2.7 其他操作

# Common summary functions on a numeric vector
sum(some.vector)  # get "35"
max(some.vector)  # get "9"
min(some.vector)  # get "2"
range(some.vector)  # get "2 9"
mean(some.vector)  # get "5.833333"

# Locate the days on which the minimum temperature occurred
temperatures
##    Monday   Tuesday Wednesday  Thursday    Friday
##        28        29        27        27        30
which(temperatures == min(temperatures))
## Wednesday  Thursday
##         3         4
max(temperatures)
## [1] 30

## 1.3 矩陣

### 1.3.1 創建矩陣

# matrix() written out with all of its arguments named explicitly
matrix(data = NA, nrow = 1, ncol = 1, byrow = FALSE, dimnames = NULL)

# byrow = TRUE fills the matrix row by row
matrix(1:6, nrow = 2, byrow = TRUE)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
# byrow = FALSE fills it column by column
matrix(1:6, nrow = 2, byrow = FALSE)  # default argument
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6

# When the data does not fill the matrix, it is recycled from the start (with a warning)
matrix(1:11, nrow = 3)
## Warning in matrix(1:11, nrow = 3): data length [11] is not a sub-multiple or
## multiple of the number of rows [3]
##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9    1

# Matrices can also hold character or logical values
matrix(c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"), nrow = 4)
##      [,1]       [,2]     [,3]
## [1,] "January"  "May"    "September"
## [2,] "February" "June"   "October"
## [3,] "March"    "July"   "November"
## [4,] "April"    "August" "December"
matrix(c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE), nrow = 2)
##       [,1] [,2]  [,3]
## [1,]  TRUE TRUE FALSE
## [2,] FALSE TRUE FALSE

# Build a 2-row matrix from the two weather vectors and label rows and columns
climate <- matrix(c(temperatures, rains), byrow = TRUE, nrow = 2)
rownames(climate) <- c("Temperatures", "Rains")
colnames(climate) <- days
climate
##              Monday Tuesday Wednesday Thursday Friday
## Temperatures     28      29        27       27     30
## Rains             0       5         6        0      2

# dim() gives the number of rows and columns
dim(climate)  # get "2 5"

### 1.3.2 合併矩陣與矩陣運算

# rbind() appends a vector as a new row; the variable name becomes the row name
Winds <- c(30, 25, 22, 24, 18)
total.climate <- rbind(climate, Winds)
total.climate
##              Monday Tuesday Wednesday Thursday Friday
## Temperatures     28      29        27       27     30
## Rains             0       5         6        0      2
## Winds            30      25        22       24     18

# rowSums() totals each row; cbind() appends the totals as a new column
totals <- rowSums(total.climate)
cbind(total.climate, totals)
##              Monday Tuesday Wednesday Thursday Friday totals
## Temperatures     28      29        27       27     30    141
## Rains             0       5         6        0      2     13
## Winds            30      25        22       24     18    119

### 1.3.3 存取矩陣中的元素

# Index a matrix as [row, column]
total.climate[2, 3]  # get "6"

# Leaving the row index empty selects a whole column
total.climate[, 3]
## Temperatures        Rains        Winds
##           27            6           22

# Leaving the column index empty selects a whole row
total.climate[2, ]
##    Monday   Tuesday Wednesday  Thursday    Friday
##         0         5         6         0         2

# Row and column names can be used in place of positions
total.climate[, "Wednesday"]
## Temperatures        Rains        Winds
##           27            6           22
total.climate["Rains",]
##    Monday   Tuesday Wednesday  Thursday    Friday
##         0         5         6         0         2
total.climate["Rains", "Wednesday"]
## [1] 6

### 1.3.4 矩陣的運算

# Summary functions work on a selected row
mean(total.climate["Temperatures", ])
## [1] 28.2

# Arithmetic with a scalar is applied to every element of the matrix
climate * 2
##              Monday Tuesday Wednesday Thursday Friday
## Temperatures     56      58        54       54     60
## Rains             0      10        12        0      4
climate ^ 2
##              Monday Tuesday Wednesday Thursday Friday
## Temperatures    784     841       729      729    900
## Rains             0      25        36        0      4
# Ratio of rainfall to temperature for each day
climate[2, ] / climate[1, ]
##     Monday    Tuesday  Wednesday   Thursday     Friday
## 0.00000000 0.17241379 0.22222222 0.00000000 0.06666667

## 1.4 Factors

# A plain character vector of categories
sizes <- c("Small", "Big", "Big", "Medium", "Medium", "Small", "Medium", "Small", "Small")
sizes
## [1] "Small"  "Big"    "Big"    "Medium" "Medium" "Small"  "Medium" "Small"
## [9] "Small"

# summary() of a character vector only describes the container
summary(sizes)
##    Length     Class      Mode
##         9 character character

# factor() turns it into a categorical variable; summary() then counts each level
factor.sizes <- factor(sizes)
factor.sizes
## [1] Small  Big    Big    Medium Medium Small  Medium Small  Small
## Levels: Big Medium Small
summary(factor.sizes)
##    Big Medium  Small
##      2      3      4
levels(factor.sizes)
## [1] "Big"    "Medium" "Small"

# Build an ordered factor: `levels` fixes the ranking Small < Medium < Big
# instead of the default alphabetical order.
sizes2 <- c("Small", "Big", "Big", "Medium", "Medium", "Small", "Medium", "Small", "Small")
# Use sizes2 (defined just above) rather than the earlier `sizes` vector
factor.sizes2 <- factor(sizes2, ordered=TRUE, levels=c("Small", "Medium", "Big"))
factor.sizes2
## [1] Small  Big    Big    Medium Medium Small  Medium Small  Small
## Levels: Small < Medium < Big

# Rename the levels of a factor
survey.vector <- c("M", "F", "F", "M", "M", "F", "M", "M")
factor.survey.vector <- factor(survey.vector)
# The levels are stored in the order "F" "M" at this point, so they can be overwritten with c("Female", "Male")
levels(factor.survey.vector) <- c("Female", "Male")
factor.survey.vector
## [1] Male   Female Female Male   Male   Female Male   Male
## Levels: Female Male

# Elements of the ordered factor factor.sizes2 can be compared with relational operators
factor.sizes2[1] < factor.sizes2[2]  # get "TRUE"
factor.sizes2[2] == factor.sizes2[2]  # get "TRUE"
factor.sizes2[2] != factor.sizes2[3]  # get "FALSE"

## 1.5 Data Frames

# Open the help page of the datasets package and list its contents
?datasets
library(help="datasets")

# str() summarises the structure of a data frame, one line per column
str(OrchardSprays)
## 'data.frame':    64 obs. of  4 variables:
##  $ decrease : num  57 95 8 69 92 90 15 2 84 6 ...
##  $ rowpos   : num  1 2 3 4 5 6 7 8 1 2 ...
##  $ colpos   : num  1 1 1 1 1 1 1 1 2 2 ...
##  $ treatment: Factor w/ 8 levels "A","B","C","D",..: 4 5 2 8 7 6 3 1 3 2 ...

# Build a data frame from equal-length column vectors; each vector's name
# becomes the column name.
name <- c("Alfonso", "Carlos", "Lluis", "Diego")
# last.name is passed to data.frame() below, so it must be defined here;
# the values are the ones shown in the str() output.
last.name <- c("Zamora", "Quesada", "Hurtado", "Mondejar")
second.last.name <- c("Saiz", "Gonzalez", "Gil", "Ruiz")
age <- c(33, 32, 30, 37)
phd <- c("math", "math", "physics", "math")
office <- c(4, 14, 6, 8)
from.madrid <- c(FALSE, TRUE, FALSE, TRUE)
professors <- data.frame(name, last.name, second.last.name, age, phd, office, from.madrid)
str(professors)
## 'data.frame':    4 obs. of  7 variables:
##  $ name            : chr  "Alfonso" "Carlos" "Lluis" "Diego"
##  $ last.name       : chr  "Zamora" "Quesada" "Hurtado" "Mondejar"
##  $ second.last.name: chr  "Saiz" "Gonzalez" "Gil" "Ruiz"
##  $ age             : num  33 32 30 37
##  $ phd             : chr  "math" "math" "physics" "math"
##  $ office          : num  4 14 6 8
##  $ from.madrid     : logi  FALSE TRUE FALSE TRUE

### 1.5.1 存取 data frames 中的元素與 subset

想要選取 data frames 中的元素，就像選取矩陣中的元素一樣，即 dataframes[row, col]，如：

professors[2, 3]
professors[1, ]  # 顯示第一個 row
professors[, 2]  # 顯示第二個 column
professors[1 : 2, ]  # 選擇頭兩個 rows

要注意的是，data frames 的 column 的名字是來自 column vector 的名字，所以我們也可以用 $ 來存取特定的 column vector，如：

# `$` extracts one column of the data frame as a vector
professors$phd
## [1] "math"    "math"    "physics" "math"
# The extracted vector can be indexed like any other vector
professors$phd[3]
## [1] "physics"

madrileans <- professors$from.madrid
professors[madrileans, ]
##     name last.name second.last.name age  phd office from.madrid
## 2 Carlos   Quesada         Gonzalez  32 math     14        TRUE
## 4  Diego  Mondejar             Ruiz  37 math      8        TRUE

以指令 subset(data_frame, subset=logical_condition) 也能做到類似的事，如：

subset(professors, subset=age > 31)
##      name last.name second.last.name age  phd office from.madrid
## 1 Alfonso    Zamora             Saiz  33 math      4       FALSE
## 2  Carlos   Quesada         Gonzalez  32 math     14        TRUE
## 4   Diego  Mondejar             Ruiz  37 math      8        TRUE
subset(professors, subset=phd == "math")
##      name last.name second.last.name age  phd office from.madrid
## 1 Alfonso    Zamora             Saiz  33 math      4       FALSE
## 2  Carlos   Quesada         Gonzalez  32 math     14        TRUE
## 4   Diego  Mondejar             Ruiz  37 math      8        TRUE

### 1.5.2 排序

想要以某個特定的變數的順序整理 data frames，可以使用 order()，如：

positions <- order(professors$age)
# Use the index vector `positions` as the row index to print the rows in that order
professors[positions, ]
##      name last.name second.last.name age     phd office from.madrid
## 3   Lluis   Hurtado              Gil  30 physics      6       FALSE
## 2  Carlos   Quesada         Gonzalez  32    math     14        TRUE
## 1 Alfonso    Zamora             Saiz  33    math      4       FALSE
## 4   Diego  Mondejar             Ruiz  37    math      8        TRUE

## 1.6 Lists

Lists 就像向量，但其元素可以有不同的長度、大小、型態。我們可以用 list() 創建 lists，如：

# A list can hold objects of different types: here a character vector,
# a factor and a matrix
new.list <- list(days, factor.sizes, climate)
new.list
## [[1]]
## [1] "Monday"    "Tuesday"   "Wednesday" "Thursday"  "Friday"
##
## [[2]]
## [1] Small  Big    Big    Medium Medium Small  Medium Small  Small
## Levels: Big Medium Small
##
## [[3]]
##              Monday Tuesday Wednesday Thursday Friday
## Temperatures     28      29        27       27     30
## Rains             0       5         6        0      2

new.list[1]  # the first element
new.list[[1]][3]  # the third element inside the first element
new.list[[3]][1, 2:5]  # columns 2 to 5 of the first row of the third element

# Give each list element a name at creation time
new.list <- list(the.days=days, the.factors=factor.sizes, the.data=climate)
new.list
## $the.days
## [1] "Monday"    "Tuesday"   "Wednesday" "Thursday"  "Friday"
##
## $the.factors
## [1] Small  Big    Big    Medium Medium Small  Medium Small  Small
## Levels: Big Medium Small
##
## $the.data
##              Monday Tuesday Wednesday Thursday Friday
## Temperatures     28      29        27       27     30
## Rains             0       5         6        0      2

這樣就可以用 $名字 來存取 list 中的元素，如：

# Access list elements by name with `$`
new.list$the.factors
# The extracted element can be indexed further (row 2, column 5 of the matrix)
new.list$the.data[2, 5]
# Single brackets with a name select that element as well
new.list["the.data"]

# Add elements by assigning to a new name; remove them by assigning NULL
new.list[["professors"]] <- professors  # add a fourth element: the data frame, named "professors"
new.list[[""]] <- positions  # add a fifth element: a vector without a name
new.list[[5]] <- NULL  # delete the fifth element
new.list[["professors"]] <- NULL  # delete the element named "professors"

# Show the structure of the resulting list
str(new.list)

### 參考文獻

Zamora Saiz, Alfonso, Carlos Quesada González, Lluís Hurtado Gil, and Diego Mondéjar Ruiz. 2020. An Introduction to Data Analysis in R: Hands-on Coding, Data Mining, Visualization and Statistics from Scratch. Use R! Cham: Springer International Publishing. https://doi.org/10.1007/978-3-030-48997-7.

1. 傳 character 給 as.integer 的話將會產生 NA，傳 TRUE 或 FALSE 的話會變成整數 1 或 0，而傳 complex 的話則虛數的部份將會被自動捨棄。↩︎