1 R 数据结构
1.1 向量
R 里面的向量看起来像 Python 的 list,但又不是 list:向量中所有元素的类型必须相同,更像是 Python 里(NumPy 的)一维数组。
向量入门
基础
# Build a numeric vector with c().
vec <- c(1, 2, 3, 6, 5, 4)
# Index with a vector of positions: picks the 1st, 2nd and 4th elements.
vec[c(1, 2, 4)]
#> [1] 1 2 6
class(vec)
#> [1] "numeric"
vec[1:2] # the end of the range is included
#> [1] 1 2
对应的 Python 代码:除了用列表推导式之外,还可以直接用 NumPy 实现。
import numpy as np

# NumPy counterpart of the R vector example; positions 0, 1 and 3 are selected.
a = np.array([1, 2, 3, 6, 5, 4])
# print is called as a function so the snippet runs on Python 3 as well as Python 2.
print(a[[0, 1, 3]])
print(type(a))
#> [1 2 6]
#> <type 'numpy.ndarray'>
# (Under Python 3 the last line reads <class 'numpy.ndarray'>.)
创建向量 类 Python range
# seq(n) counts from 1 up to n, with n included.
a <- seq(10)
a
#> [1] 1 2 3 4 5 6 7 8 9 10
# With two arguments seq() runs from the first value to the second, inclusive.
b <- seq(10, 13)
b
#> [1] 10 11 12 13
temp <- c(1, 2, 4, 0)
# Multiplying two vectors of equal length works element by element.
temp * b
#> [1] 10 22 48 0
生成有规律的向量, 类 np.linspace
# length.out fixes how many evenly spaced values are produced.
# NOTE(review): the original note said a "long" argument is also very useful;
# presumably this means seq()'s `along.with` argument -- confirm.
vec <- seq(1, 100, length.out = 10)
vec
#> [1] 1 12 23 34 45 56 67 78 89 100
vec[-4] # a negative index drops that element; very different from Python
#> [1] 1 12 23 45 56 67 78 89 100
seq(from=2, to=1000, length.out = 10)
#> [1] 2 113 224 335 446 556 667 778 889 1000
# The abbreviated argument name `length` gives the same result as length.out.
seq(from=2, to=1000, length=10)
#> [1] 2 113 224 335 446 556 667 778 889 1000
# times = c(8, 10, 9) repeats 1, 2 and 3 eight, ten and nine times respectively.
group1 <- rep(1:3, times = c(8, 10, 9))
group2 <- factor(group1) # convert to a factor
group1
#> [1] 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3
class(group1)
#> [1] "integer"
class(group2)
#> [1] "factor"
length(group1) # like Python's len()
#> [1] 27
随机向量
vec_random1 <- runif(5) # five random numbers between 0 and 1
vec_random1
#> [1] 0.0808 0.8343 0.6008 0.1572 0.0074
# Draw 10 letters from 'A'/'B' with replacement: a random character vector.
vec_random2 <- sample(c('A', 'B'), size = 10, replace = TRUE)
vec_random2
#> [1] "A" "A" "A" "B" "B" "B" "A" "A" "A" "A"
# numeric(10) creates a length-10 numeric vector filled with zeros.
vector1 <- numeric(10)
vector1 # a zero-filled placeholder, not a zero-length vector
#> [1] 0 0 0 0 0 0 0 0 0 0
逻辑判断
# `temp` is the vector c(1, 2, 4, 0) created earlier in the file.
temp[1:3]
#> [1] 1 2 4
# A logical index keeps the positions marked TRUE.
temp[c(TRUE, TRUE, FALSE, FALSE)]
#> [1] 1 2
# A comparison yields such a logical vector, so this filters by value.
temp[temp > 1]
#> [1] 2 4
1.2 矩阵
矩阵是二维的容器,可包含数值、逻辑、字符等数据,但同一个矩阵内所有元素的类型必须相同。
生成矩阵
# An integer sequence used to fill the matrix.
vector <- 1:12
class(vector)
#> [1] "integer"
# byrow = FALSE fills the 3 x 4 matrix column by column (see the output below).
my_matrix <- matrix(vector, nrow = 3, ncol = 4, byrow = FALSE)
dim(my_matrix)
#> [1] 3 4
vector
#> [1] 1 2 3 4 5 6 7 8 9 10 11 12
my_matrix
#> [,1] [,2] [,3] [,4]
#> [1,] 1 4 7 10
#> [2,] 2 5 8 11
#> [3,] 3 6 9 12
# Chained assignment: all three names receive the same three random numbers.
vector1 <- vector2 <- vector3 <- runif(3)
my_matrix <- cbind(vector1, vector2, vector3) # bind vectors of the same type as columns
my_matrix
#> vector1 vector2 vector3
#> [1,] 0.1957 0.1957 0.1957
#> [2,] 0.4035 0.4035 0.4035
#> [3,] 0.0637 0.0637 0.0637
round(my_matrix*10, digits = 2) # round to 2 decimal places
#> vector1 vector2 vector3
#> [1,] 1.96 1.96 1.96
#> [2,] 4.04 4.04 4.04
#> [3,] 0.64 0.64 0.64
取子集和计算
# Nine values with ncol = 3 give a 3 x 3 matrix, filled column by column.
my_mat <- matrix(c(8, 3, 4, 1, 5, 9, 6, 7, 2), ncol = 3)
print(my_mat)
#> [,1] [,2] [,3]
#> [1,] 8 1 6
#> [2,] 3 5 7
#> [3,] 4 9 2
# First row added up by hand, then with sum(); rowSums(), colSums() and diag()
# cover every row, every column and the main diagonal.
my_mat[1,1] + my_mat[1, 2] + my_mat[1,3]
#> [1] 15
sum(my_mat[1,])
#> [1] 15
rowSums(my_mat)
#> [1] 15 15 15
colSums(my_mat)
#> [1] 15 15 15
sum(diag(my_mat))
#> [1] 15
class(my_mat[1,]) # a single row is simplified to a plain vector
#> [1] "numeric"
my_mat[1, , drop=FALSE] # drop = FALSE keeps the matrix structure
#> [,1] [,2] [,3]
#> [1,] 8 1 6
my_mat[my_mat <= 5] <- 0 # overwrite every element that is <= 5 with 0
my_mat
#> [,1] [,2] [,3]
#> [1,] 8 0 6
#> [2,] 0 0 7
#> [3,] 0 9 0
# The ifelse() function
# Rebuild the original matrix first, because the assignment above modified it.
my_mat <- matrix(c(8, 3, 4, 1, 5, 9, 6, 7, 2), ncol = 3)
# Every entry is greater than 0.5, so every position maps to 1.
ifelse(my_mat > 0.5, 1, 0)
#> [,1] [,2] [,3]
#> [1,] 1 1 1
#> [2,] 1 1 1
#> [3,] 1 1 1
1.3 数据框
数据框的优点在于不同列(向量)的数据类型可以不一样,但各列的长度必须一致。
city <- c('A', 'B', 'C', 'D')
temp <- c(27, 29, 23, 14)
# Build the data frame column by column from the two vectors.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 no longer converts
# strings to factors by default; without it `city` stays character there and
# the factor output shown below is not reproduced.
# Original note (Python comparison): pandas cannot take two lists like this
# directly, because the integer vector would be turned into the index; a dict
# has to be passed instead (the note said "dir", presumably a typo for dict).
data <- data.frame(city, temp, stringsAsFactors = TRUE)
data
#> city temp
#> 1 A 27
#> 2 B 29
#> 3 C 23
#> 4 D 14
data[, 1] # pandas: data.iloc[:, 0] (older code used data.ix[:, 0])
#> [1] A B C D
#> Levels: A B C D
data$temp
#> [1] 27 29 23 14
class(data$city) # character input stored as a factor; pass stringsAsFactors = FALSE to keep it character
#> [1] "factor"
组合索引
# Keep the rows whose temp is above the mean; the empty column slot keeps all columns.
data[data$temp > mean(data$temp), ]
#> city temp
#> 1 A 27
#> 2 B 29
# data['temp', ] # row-selection syntax: 'temp' is looked up among the rows, so this does not return the temp column
data[, 'city'] # select a column by name
#> [1] A B C D
#> Levels: A B C D
data$temp > mean(data$temp)
#> [1] TRUE TRUE FALSE FALSE
with(data, data[temp > mean(temp), ]) # with() lets the columns be used by name, without $
#> city temp
#> 1 A 27
#> 2 B 29
with(data, data[temp > mean(temp), 'city'])
#> [1] A B
#> Levels: A B C D
熟悉数据框最快的办法
summary(data) # like pandas describe()
#> city temp
#> A:1 Min. :14.0
#> B:1 1st Qu.:20.8
#> C:1 Median :25.0
#> D:1 Mean :23.2
#> 3rd Qu.:27.5
#> Max. :29.0
dim(data) # like pandas data.shape
#> [1] 4 2
head(data) # show the first 6 rows (this data frame has only 4)
#> city temp
#> 1 A 27
#> 2 B 29
#> 3 C 23
#> 4 D 14
# data.head(1) # pandas-style method call; this does not work in R
head(data, n = 1L)
#> city temp
#> 1 A 27
str(data) # compact description of the object's structure
#> 'data.frame': 4 obs. of 2 variables:
#> $ city: Factor w/ 4 levels "A","B","C","D": 1 2 3 4
#> $ temp: num 27 29 23 14
数据框的排序
order(data$temp) # row positions that put temp in ascending order
#> [1] 4 3 1 2
# Use those positions as the row index to sort the whole data frame.
data[order(data$temp), ]
#> city temp
#> 4 D 14
#> 3 C 23
#> 1 A 27
#> 2 B 29
# Descending order, then keep the first two rows (the two highest temperatures).
# TRUE is written out because T is an ordinary variable that can be reassigned.
data[order(data$temp, decreasing = TRUE), ][1:2, ]
#> city temp
#> 2 B 29
#> 1 A 27
1.4 列表
# A list can hold objects of different kinds under names.
data_list <- list(temp = temp, city = city)
print(data_list)
#> $temp
#> [1] 27 29 23 14
#>
#> $city
#> [1] "A" "B" "C" "D"
# Assigning to a new name with $ appends an element: here a matrix and a data frame.
data_list$mat <- my_mat
data_list$data <- data
names(data_list)
#> [1] "temp" "city" "mat" "data"
# Single brackets return a list containing the selected element.
class(data_list[3])
#> [1] "list"
data_list[3]
#> $mat
#> [,1] [,2] [,3]
#> [1,] 8 1 6
#> [2,] 3 5 7
#> [3,] 4 9 2
data_list[[3]] # double brackets return the element itself, without its name
#> [,1] [,2] [,3]
#> [1,] 8 1 6
#> [2,] 3 5 7
#> [3,] 4 9 2
class(data_list[[3]])
#> [1] "matrix"
1.5 特殊对象
缺失值和空值
temp <- c(27, 29, 23, 14, NA)
mean(temp) # the NA makes the result NA
#> [1] NA
mean(temp, na.rm = TRUE) # na.rm = TRUE removes missing values first
#> [1] 23.2
temp <- c(27, 29, 23, 14, NULL) # NULL contributes no element, so four values remain
# Original note: "quickly delete a column". As written, this replaces the whole
# object with NULL; assigning NULL to a single element (e.g. data_list$mat <- NULL)
# is the form that removes just that element.
data_list <- NULL
mean(temp) # no na.rm needed: the NULL left nothing behind in temp
#> [1] 23.2
连接
# Open a write-mode text connection; what is written to it is stored in the
# character vector named `output`.
textcon <- textConnection('output', 'w')
sink(textcon) # start diverting console output to the connection
x <- runif(10)
summary(x)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.051 0.315 0.604 0.557 0.740 0.981
print('这话不显示了,写入了 output 对象了')
#> [1] "这话不显示了,写入了 output 对象了"
sink() # stop diverting; output goes back to the console
# NOTE(review): the character(0) below, together with the output still being
# displayed above, suggests the diversion did not take effect when this
# document was rendered -- confirm in an interactive session.
print(output)
#> character(0)
# NOTE(review): two open connections named "output" are listed, which looks
# like one left over from an earlier run -- confirm.
showConnections()
#> description class mode text isopen can read can write
#> 4 "output" "textConnection" "wr" "text" "opened" "no" "yes"
#> 5 "output" "textConnection" "w" "text" "opened" "no" "yes"
class(textcon)
#> [1] "textConnection" "connection"
close(textcon) # always close the connection when done
公式
# Build a model formula with 50 predictors from strings.
n <- 1:50
xvar <- paste0('x', n ) # "x1", "x2", ..., "x50"
right <- paste(xvar, collapse = ' + ') # join the names into one string: "x1 + x2 + ..."
left <- 'y~'
my_formula <- paste(left, right) # still a character string at this point
my_formula <- as.formula(my_formula) # convert the string into a formula object
class(my_formula)
#> [1] "formula"
left
#> [1] "y~"
表达式
# expression() stores the code without running it.
ex <- expression(x <- seq(1, 10, 2))
print(ex)
#> expression(x <- seq(1, 10, 2))
class(ex)
#> [1] "expression"
eval(ex) # run the stored code now; this is what assigns x
print(x)
#> [1] 1 3 5 7 9
# Code held as text: parse() turns the strings into expressions, eval() runs them.
tex <- c('z<-seq(1, 10, 2)', 'print(z)' )
eval(parse(text = tex))
#> [1] 1 3 5 7 9
环境变量
ls() # names of the objects created so far in the workspace
#> [1] "a" "b" "city" "data" "data_list"
#> [6] "ex" "group1" "group2" "left" "my_formula"
#> [11] "my_mat" "my_matrix" "n" "output" "right"
#> [16] "temp" "tex" "textcon" "vec" "vec_random1"
#> [21] "vec_random2" "vector" "vector1" "vector2" "vector3"
#> [26] "x" "xvar" "z"
env1 <- new.env() # create a separate environment
assign("x1", 1:5, envir = env1) # bind x1 inside env1
ls(envir = env1)
#> [1] "x1"
get('x1', envir = env1) # x1 was bound in env1 only, which is why it is fetched through envir = env1
#> [1] 1 2 3 4 5
exists('x1', envir = env1)
#> [1] TRUE
rm('x1', envir = env1) # remove the binding from env1 again
函数
exp(1) # calling a built-in function
#> [1] 2.72
# Area of a circle with radius `r`.
# The last expression is the return value, so no explicit return() is needed.
# Original note: a function body can also use values from the global
# environment (here, `pi`).
myfunc <- function(r) {
  pi * r^2
}
print(myfunc(4))
#> [1] 50.3
# Typing the name without parentheses prints the function's source.
myfunc
#> function(r) {
#>   pi * r^2
#> }