## 4.2 stringr

### 4.2.1 安装

# Install the released version from CRAN:
install.packages("stringr")

# Install the cutting edge development version from GitHub:
# install.packages("devtools")
devtools::install_github("tidyverse/stringr")

### 4.2.2 基本使用

stringr包中所有的函数都以str_开头，并且把待处理的字符串作为第一个参数，这样设计的好处很明显：方便使用，也方便记忆。

• 字符串长度
library(stringr)
char <- "我是R语言学习者"
str_length(char)
#> [1] 8
# 向量化
str_length(c("a", "R for data science", NA))
#> [1]  1 18 NA
• 连接字符串

R中不能像Python那样用加号连接字符串,而需要使用函数来连接,如下所示:

R 版本

#base R
paste0('a','b')
#> [1] "ab"

#stringr
str_c("a","b")
#> [1] "ab"
str_c("a", "b", sep = ", ") #sep 参数控制分隔符
#> [1] "a, b"

Python 版本

'a' + 'b'
#> 'ab'

#base R
paste0(c('a','b','d','e'),collapse = ',')
#> [1] "a,b,d,e"
#stringr
str_c(c('a','b','d','e'),collapse = ',')  #collapse 参数控制将向量合并为一个字符串时使用的分隔符
#> [1] "a,b,d,e"
• 移除

str_remove(string = 'a||b',pattern = "\\|\\|")
#> [1] "ab"

### 4.2.3 常用函数

#### 4.2.3.1 截取字符

与Excel的left、mid、right函数功能类似。

str_sub() 函数有三个参数:

string:需要被截取的字符串

start: 默认1L,即从最开始截取

end:默认-1L,即截取到最后

#注意end 3 和 -3的区别
str_sub(string = '我是R语言学习者',start = 2,end = 3)
#> [1] "是R"
str_sub(string = '我是R语言学习者',start = 2,end = -3)
#> [1] "是R语言学"

#### 4.2.3.2 匹配字符

str_match(string, pattern)
str_match_all(string, pattern)
str_extract(string, pattern)
str_extract_all(string, pattern, simplify = FALSE)

str_extract()函数返回向量,str_match()函数返回矩阵.

# 测试文字来源烽火戏诸侯的<剑来>
strings <- c('陈平安放下新折的那根桃枝,吹灭蜡烛,走出屋子后,坐在台阶上,仰头望去,星空璀璨.')
str_extract(strings,'陈平安')
#> [1] "陈平安"
str_match(strings,'陈平安')
#>      [,1]
#> [1,] "陈平安"
• 匹配中文

str_extract_all(strings,'[\u4e00-\u9fa5]') #返回list
#> [[1]]
#>  [1] "陈" "平" "安" "放" "下" "新" "折" "的" "那" "根" "桃" "枝" "吹" "灭" "蜡"
#> [16] "烛" "走" "出" "屋" "子" "后" "坐" "在" "台" "阶" "上" "仰" "头" "望" "去"
#> [31] "星" "空" "璀" "璨"
• 匹配数字或英文

strings <- c('00123545','LOL league of legends')
str_extract_all(strings,'[0-9]')
#> [[1]]
#> [1] "0" "0" "1" "2" "3" "5" "4" "5"
#>
#> [[2]]
#> character(0)
str_extract_all(strings,'[a-zA-Z]')
#> [[1]]
#> character(0)
#>
#> [[2]]
#>  [1] "L" "O" "L" "l" "e" "a" "g" "u" "e" "o" "f" "l" "e" "g" "e" "n" "d" "s"

#### 4.2.3.3 添加字符

str_pad(string = 1:12,width = 2,side = 'left',pad = '0')
#>  [1] "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12"

#### 4.2.3.4 去除空格

与Excel的trim函数功能类似，剔除字符串首尾的空格，但是不可以剔除字符串中间的空格。

# side 可选 both  left right
str_trim(' ab af ',side = 'both')
#> [1] "ab af"

#### 4.2.3.5 分割字符

str_split()处理后的结果是列表

# 得到列表,需要向量化
str_split("a,b,d,e",pattern = ',')
#> [[1]]
#> [1] "a" "b" "d" "e"

str_split('ab||cd','\\|\\|') %>% unlist()
#> [1] "ab" "cd"
# same as above
#str_split('ab||cd','\\|\\|') %>% purrr::as_vector()

fruits <- c(
"apples and oranges and pears and bananas",
"pineapples and mangos and guavas"
)

str_split(fruits, " and ")
#> [[1]]
#> [1] "apples"  "oranges" "pears"   "bananas"
#>
#> [[2]]
#> [1] "pineapples" "mangos"     "guavas"

#### 4.2.3.6 替换字符

str_replace()、str_replace_all()函数用来替换字符：前者只替换第一处匹配，后者替换所有匹配。

fruits <- c("one apple", "two pears", "three bananas")
str_replace(fruits, "[aeiou]", "-")
#> [1] "-ne apple"     "tw- pears"     "thr-e bananas"
str_replace_all(fruits, "[aeiou]", "-")
#> [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

#### 4.2.3.7 移除字符

str_remove(),str_remove_all()移除字符。本人常用该函数剔除文本中的空格。

fruits <- c("one apple", "two pears", "three bananas")
str_remove(fruits, "[aeiou]")
#> [1] "ne apple"     "tw pears"     "thre bananas"
str_remove_all(fruits, "[aeiou]")
#> [1] "n ppl"    "tw prs"   "thr bnns"

str_replace_all(string = ' d a  b ',pattern = ' ',replacement = '')
#> [1] "dab"

#### 4.2.3.8 字符排序

numeric 参数决定是否按照数字排序。

str_order(x, decreasing = FALSE, na_last = TRUE, locale = "en",
numeric = FALSE, ...)

str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "en",
numeric = FALSE, ...)
str_order(letters)
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
#> [26] 26
str_sort(letters)
#>  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
#> [20] "t" "u" "v" "w" "x" "y" "z"

numeric参数

x <- c("100a10", "100a5", "2b", "2a")
str_sort(x)
#> [1] "100a10" "100a5"  "2a"     "2b"
str_sort(x, numeric = TRUE)
#> [1] "2a"     "2b"     "100a5"  "100a10"

#### 4.2.3.9 提取单词

• 参数
word(string, start = 1L, end = start, sep = fixed(" "))
• 案例
sentences <- c("Jane saw a cat", "Jane sat down")
word(sentences, 2, -1)
#> [1] "saw a cat" "sat down"
word(sentences[1], 1:3, -1)
#> [1] "Jane saw a cat" "saw a cat"      "a cat"

# Can define words by other separators
str <- 'abc.def..123.4568.999'
word(str, 1, sep = fixed('..'))
#> [1] "abc.def"
word(str, 2, sep = fixed('..'))
#> [1] "123.4568.999"

#### 4.2.3.10 其他函数

• str_subset str_which

str_subset()是对x[str_detect(x,pattern)]的包装，str_which()是which(str_detect(x,pattern))的包装。

fruit <- c("apple", "banana", "pear", "pinapple")
str_subset(fruit, "a")
#> [1] "apple"    "banana"   "pear"     "pinapple"
# 返回匹配成功的元素在向量中的位置(下标)
str_which(fruit, "a")
#> [1] 1 2 3 4

#筛选出字母行
set.seed(24)
dt <- data.table::data.table(col=sample(c(letters,1:10),100,replace = T))
head(dt[str_which(col,pattern = '[a-z]')])
• str_dup()

str_dup()功能是复制字符串。

fruit <- c("apple", "pear", "banana")
str_dup(fruit, 2)
str_dup(fruit, 1:3)
str_c("ba", str_dup("na", 0:5))
• str_starts() str_ends()

str_starts('abd','a')
#> [1] TRUE
str_detect('abd','^a')
#> [1] TRUE

str_ends('abd','d')
#> [1] TRUE
str_detect('abd','d$')
#> [1] TRUE
• 大小写转换

str_to_upper()函数将全部字符转换为大写，str_to_lower()函数将全部字符转换为小写，str_to_title()将每个单词的首字母转换为大写，str_to_sentence()将句子的首字母转换为大写。

dog <- "The quick brown dog"
str_to_upper(dog)
#> [1] "THE QUICK BROWN DOG"
str_to_lower(dog)
#> [1] "the quick brown dog"
str_to_title(dog)
#> [1] "The Quick Brown Dog"
str_to_sentence("the quick brown dog")
#> [1] "The quick brown dog"