# 第 9 章 正则表达式

library(tidyverse)
library(stringr)

## 9.1 问题

## # A tibble: 8 x 2
##   <int> <chr>
## 1     1 Sichuan Univ, Coll Chem
## 2     2 Sichuan Univ, Coll Elect Engn
## 3     3 Sichuan Univ, Dept Phys
## 4     4 Sichuan Univ, Coll Life Sci
## 5     6 Sichuan Univ, Food Engn
## 6     7 Sichuan Univ, Coll Phys
## 7     8 Sichuan Univ, Sch Business
## 8     9 Wuhan Univ, Mat Sci

## 9.2 什么是正则表达式

• 条件搜索
• 统计文中前面带有 “data”、“computer” 或 “statistical” 的 “analysis” 这个单词出现的次数
• 找出文中重复的单词，比如“we love love you”
• 拼写检查
• 电话号码（邮件，密码等）是否正确格式
• 日期书写的规范与统一
• 提取信息
• 提取文本特定位置的数据
• 文本挖掘
• 非结构化的提取成结构化

R 语言中很多函数都需要使用正则表达式，然而正则表达式不太好学。幸运的是，大神Hadley Wickham开发的stringr包让正则表达式简单易懂，因此今天我们就介绍这个包。本章的内容与《R for data science》第10章基本一致。本章的目的是教大家学会写简单的正则表达式。

## 9.3 字符串基础

### 9.3.1 字符串长度

str_length("R for data science")
## [1] 18

str_length(c("a", "R for data science", NA))
## [1]  1 18 NA

data.frame(
x = c("a", "R for data science", NA)
) %>%
mutate(y = str_length(x))
##                    x  y
## 1                  a  1
## 2 R for data science 18
## 3               <NA> NA

### 9.3.2 字符串组合

str_c("x", "y")
## [1] "xy"

str_c("x", "y", sep = ", ")
## [1] "x, y"
str_c(c("x", "y", "z"), sep = ", ")
## [1] "x" "y" "z"

str_c(c("x", "y", "z"), c("x", "y", "z"), sep = ", ")
## [1] "x, x" "y, y" "z, z"

data.frame(
x = c("I", "love", "you"),
y = c("you", "like", "me")
) %>%
mutate(z = str_c(x, y, sep = "|"))
##      x    y         z
## 1    I  you     I|you
## 2 love like love|like
## 3  you   me    you|me

str_c(c("x", "y", "z"), c("a", "b", "c"), sep = "|")
## [1] "x|a" "y|b" "z|c"
str_c(c("x", "y", "z"), c("a", "b", "c"), collapse = "|")
## [1] "xa|yb|zc"

### 9.3.3 字符串取子集

x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
## [1] "App" "Ban" "Pea"

x <- c("Apple", "Banana", "Pear")
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"

x <- c("Apple", "Banana", "Pear")
x
## [1] "Apple"  "Banana" "Pear"
str_sub(x, 1, 1)
## [1] "A" "B" "P"
str_sub(x, 1, 1) <- "Q"
x
## [1] "Qpple"  "Qanana" "Qear"

## 9.4 使用正则表达式进行模式匹配

### 9.4.1 基础匹配

str_view() 是查看string是否匹配pattern，如果匹配，就高亮显示

x <- c("apple", "banana", "pear")
str_view(string = x, pattern = "an")

x <- c("apple", "banana", "pear")
str_view(x, ".a.")

c("s.d") %>%
str_view(".")
c("s.d") %>%
str_view("\\.")

### 9.4.2 锚点

x <- c("apple", "banana", "pear")
str_view(x, "^a")

# `$` anchors the match to the end of the string: only words ending in "a"
# are highlighted.
x <- c("apple", "banana", "pear")
str_view(x, "a$")

# `^` and `$` together force the pattern to cover the whole string, so
# "apple pie" and "apple cake" do not match.
x <- c("apple pie", "apple", "apple cake")
str_view(x, "^apple$")

### 9.4.3 字符类与字符选项

• \d: matches any digit.
• \s: matches any whitespace (e.g. space, tab, newline).
• [abc]: matches a, b, or c.
• [^abc]: matches anything except a, b, or c.
str_view(c("grey", "gray"), "gr[ea]y")

### 9.4.4 重复

• ?: 0 or 1
• +: 1 or more
• *: 0 or more
x <- "Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
str_view(x, "X+")

• {n}: exactly n
• {n,}: n or more
• {,m}: at most m（注意：stringr 使用的 ICU 正则引擎不支持省略下限的写法，实际使用时请写成 {0,m}）
• {n,m}: between n and m
x <- "Roman numerals: MDCCCLXXXVIII"
str_view(x, "C{2}")
str_view(x, "C{2,}")
str_view(x, "C{2,3}")
• 默认的情况，*, + 匹配都是贪婪的，也就是它会尽可能的匹配更多
• 如果想让它不贪婪，而是变得懒惰起来，可以在*, + 加个?
x <- "Roman numerals: MDCCCLXXXVIII"

str_view(x, "CLX+")
str_view(x, "CLX+?")

### 9.4.5 分组与回溯引用

ft <- fruit %>% head(10)
ft
##  [1] "apple"        "apricot"      "avocado"
##  [4] "banana"       "bell pepper"  "bilberry"
##  [7] "blackberry"   "blackcurrant" "blood orange"
## [10] "blueberry"

str_view(ft, ".{2}", match = TRUE)

str_view(ft, "(.)\\1", match = TRUE)
• . 是匹配任何字符
• (.) 将匹配项括起来，它就有了一个名字，叫\\1； 如果有两个括号，就分别叫\\1和\\2
• \\1 表示回溯引用，表示引用\\1对应的(.)

str_view(ft, "(..)\\1", match = TRUE)

str_view(ft, "(.)(.)\\2\\1", match = TRUE)

## 9.5 解决实际问题

### 9.5.1 确定一个字符向量是否匹配一种模式

x <- c("apple", "banana", "pear")
str_detect(x, "e")
## [1]  TRUE FALSE  TRUE

# `d` is not defined anywhere in the visible source; it is rebuilt here from
# the printed output below (a one-column tibble holding the same values as `x`).
# NOTE(review): confirm against the original chapter source.
d <- tibble(x = c("apple", "banana", "pear"))
d
## # A tibble: 3 x 1
##   x
##   <chr>
## 1 apple
## 2 banana
## 3 pear

# Add a logical column flagging the rows whose `x` contains an "e".
d %>% mutate(has_e = str_detect(x, "e"))
## # A tibble: 3 x 2
##   x      has_e
##   <chr>  <lgl>
## 1 apple  TRUE
## 2 banana FALSE
## 3 pear   TRUE

d %>% dplyr::filter(str_detect(x, "e"))
## # A tibble: 2 x 1
##   x
##   <chr>
## 1 apple
## 2 pear

stringr::words包含了牛津字典里常用单词

stringr::words %>% head()
## [1] "a"        "able"     "about"    "absolute"
## [5] "accept"   "account"

# How many common words start with t?
sum(str_detect(words, "^t"))
## [1] 65

# What proportion of the common words end with a vowel?
# str_detect() returns a logical vector, so mean() gives the share of TRUEs.
mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765

# Put the words in a data frame and list the ones that end with "x".
tibble(
  word = words
) %>%
  dplyr::filter(str_detect(word, "x$"))
## # A tibble: 4 x 1
##   word
##   <chr>
## 1 box
## 2 sex
## 3 six
## 4 tax

str_detect() 有一个功能类似的函数str_count()，区别在于，后者不是简单地返回是或否，而是返回字符串中匹配的数量

x <- c("apple", "banana", "pear")
str_count(x, "a")
## [1] 1 3 1
tibble(
word = words
) %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]")
)
## # A tibble: 980 x 3
##    word     vowels consonants
##    <chr>     <int>      <int>
##  1 a             1          0
##  2 able          2          2
##  3 about         3          2
##  4 absolute      4          4
##  5 accept        2          4
##  6 account       3          4
##  7 achieve       4          3
##  8 across        2          4
##  9 act           1          2
## 10 active        3          3
## # ... with 970 more rows

### 9.5.2 确定匹配的位置

str_count("abababa", "aba")
## [1] 2
str_view_all("abababa", "aba")

### 9.5.3 提取匹配的内容

colours <- c(
"red", "orange", "yellow",
"green", "blue", "purple"
)
colour_match <- str_c(colours, collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"

colour_match 这里是一个字符串，放在pattern参数位置上也是正则表达式了,

str_view("abcd", "ab|cd")
str_view("abc", "a[bc]d")
more <- "It is hard to erase blue or red ink."
str_extract(more, pattern = colour_match)
## [1] "blue"
str_extract_all(more, pattern = colour_match)
## [[1]]
## [1] "blue" "red"
more <- sentences[str_count(sentences, colour_match) > 1]
more
## [1] "It is hard to erase blue or red ink."
## [2] "The green light in the brown box flickered."
## [3] "The sky in the west is tinged with orange red."

# Keep the sentences that mention more than one colour.
# The count is taken on the tibble's own `sentence` column rather than on the
# global `sentences` vector, so the filter does not silently rely on the two
# having the same length and order.
tibble(sentence = sentences) %>%
  dplyr::filter(str_count(sentence, colour_match) > 1)
## # A tibble: 3 x 1
##   sentence
##   <chr>
## 1 It is hard to erase blue or red ink.
## 2 The green light in the brown box flickered.
## 3 The sky in the west is tinged with orange red.

str_extract()提取匹配, 谁先匹配就提取谁

tibble(x = more) %>%
mutate(color = str_extract(x, colour_match))
## # A tibble: 3 x 2
##   x                                              color
##   <chr>                                          <chr>
## 1 It is hard to erase blue or red ink.           blue
## 2 The green light in the brown box flickered.    green
## 3 The sky in the west is tinged with orange red. orange

str_extract_all()提取全部匹配项

tibble(x = more) %>%
mutate(color = str_extract_all(x, colour_match))
## # A tibble: 3 x 2
##   x                                            color
##   <chr>                                        <list>
## 1 It is hard to erase blue or red ink.         <chr [2~
## 2 The green light in the brown box flickered.  <chr [2~
## 3 The sky in the west is tinged with orange r~ <chr [2~
tibble(x = more) %>%
mutate(color = str_extract_all(x, colour_match)) %>%
unnest(color)
## # A tibble: 6 x 2
##   x                                              color
##   <chr>                                          <chr>
## 1 It is hard to erase blue or red ink.           blue
## 2 It is hard to erase blue or red ink.           red
## 3 The green light in the brown box flickered.    green
## 4 The green light in the brown box flickered.    red
## 5 The sky in the west is tinged with orange red. orange
## 6 The sky in the west is tinged with orange red. red

### 9.5.4 替换匹配内容

x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
## [1] "-pple"  "p-ar"   "b-nana"

str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-"  "p--r"   "b-n-n-"

### 9.5.5 拆分字符串

lines <- "I love my country"
lines
## [1] "I love my country"
str_split(lines, " ")
## [[1]]
## [1] "I"       "love"    "my"      "country"
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
fields %>% str_split(": ", n = 2, simplify = TRUE)
##      [,1]      [,2]
## [1,] "Name"    "Hadley"
## [2,] "Country" "NZ"
## [3,] "Age"     "35"

## 9.6 进阶部分

win <- c("Windows2000", "Windows", "Windows3.1")
str_view(win, "Windows(?=95|98|NT|2000)")
win <- c("Windows2000", "Windows", "Windows3.1")
str_view(win, "Windows(?!95|98|NT|2000)")

Windows后面的 () 是匹配条件，事实上，有四种情形：

• (?=pattern) 要求此位置的后面必须匹配表达式pattern
• (?!pattern) 要求此位置的后面不能匹配表达式pattern
• (?<=pattern) 要求此位置的前面必须匹配表达式pattern
• (?<!pattern) 要求此位置的前面不能匹配表达式pattern

### 9.6.2 look behind

win <- c("2000Windows", "Windows", "3.1Windows")
str_view(win, "(?<=95|98|NT|2000)Windows")
win <- c("2000Windows", "Windows", "3.1Windows")
str_view(win, "(?<!95|98|NT|2000)Windows")

## 9.7 案例分析

### 9.7.1 案例1

dt <- tibble(
x = 1:4,
y = c("wk 3", "week-1", "7", "w#9")
)
dt
## # A tibble: 4 x 2
##       x y
##   <int> <chr>
## 1     1 wk 3
## 2     2 week-1
## 3     3 7
## 4     4 w#9
dt %>%
mutate(
z = str_extract(y, "[0-9]")
)
## # A tibble: 4 x 3
##       x y      z
##   <int> <chr>  <chr>
## 1     1 wk 3   3
## 2     2 week-1 1
## 3     3 7      7
## 4     4 w#9    9

### 9.7.2 案例2

# Sample records: a row index plus a code string in mixed formats
# (digits only, letter prefix, letter suffix, letters only).
# `1:7` replaces the redundant `seq_along(1:7)`; both yield the integers 1..7.
df <- data.frame(
  x = 1:7,
  y = c(
    "2016123456", "20150513", "AB2016123456", "J2017000987",
    "B2017000987C", "aksdf", "2014"
  )
)
df
##   x            y
## 1 1   2016123456
## 2 2     20150513
## 3 3 AB2016123456
## 4 4  J2017000987
## 5 5 B2017000987C
## 6 6        aksdf
## 7 7         2014
df %>%
mutate(
item = str_extract_all(y, "[A-Z]")
) %>%
tidyr::unnest(item)
## # A tibble: 5 x 3
##       x y            item
##   <int> <chr>        <chr>
## 1     3 AB2016123456 A
## 2     3 AB2016123456 B
## 3     4 J2017000987  J
## 4     5 B2017000987C B
## 5     5 B2017000987C C

### 9.7.3 案例3

tb <- tibble(x = c("I我", "love爱", "you你"))
tb
## # A tibble: 3 x 1
##   x
##   <chr>
## 1 I我
## 2 love爱
## 3 you你
tb %>%
tidyr::extract(
# x, c("en", "cn"), "([:alpha:]+)([^:alpha:]+)",
x, c("en", "cn"), "([a-zA-Z]+)([^a-zA-Z]+)",
remove = FALSE
)
## # A tibble: 3 x 3
##   x      en    cn
##   <chr>  <chr> <chr>
## 1 I我    I     我
## 2 love爱 love  爱
## 3 you你  you   你

### 9.7.4 案例4

df <- tibble(x = c("1-12周", "1-10周", "5-12周"))
df
## # A tibble: 3 x 1
##   x
##   <chr>
## 1 1-12周
## 2 1-10周
## 3 5-12周
df %>% extract(
x,
# c("start", "end", "cn"), "([:digit:]+)-([:digit:]+)([^:alpha:]+)",
c("start", "end", "cn"), "(\\d+)-(\\d+)(\\D+)",
remove = FALSE
)
## # A tibble: 3 x 4
##   x      start end   cn
##   <chr>  <chr> <chr> <chr>
## 1 1-12周 1     12    周
## 2 1-10周 1     10    周
## 3 5-12周 5     12    周

### 9.7.5 案例5

df <- tibble(
x = c("12W34", "AB2C46", "B217C", "akTs6df", "21WD4")
)
df %>%
mutate(item = str_extract_all(x, "(?<=[A-Z])[0-9]")) %>%
tidyr::unnest(item)
## # A tibble: 5 x 2
##   x      item
##   <chr>  <chr>
## 1 12W34  3
## 2 AB2C46 2
## 3 AB2C46 4
## 4 B217C  2
## 5 21WD4  4

• 如何提取大写字母后的连续数字，比如B217C后面的217
• 如何提取数字前的大写字母？
• 为什么第一个正则表达式返回结果为""
x <- "Roman numerals: MDCCCLXXXVIII"
str_match_all(x, "C?") # "?"的意思是匹配0次或者1次
## [[1]]
##       [,1]
##  [1,] ""
##  [2,] ""
##  [3,] ""
##  [4,] ""
##  [5,] ""
##  [6,] ""
##  [7,] ""
##  [8,] ""
##  [9,] ""
## [10,] ""
## [11,] ""
## [12,] ""
## [13,] ""
## [14,] ""
## [15,] ""
## [16,] ""
## [17,] ""
## [18,] ""
## [19,] "C"
## [20,] "C"
## [21,] "C"
## [22,] ""
## [23,] ""
## [24,] ""
## [25,] ""
## [26,] ""
## [27,] ""
## [28,] ""
## [29,] ""
## [30,] ""
str_match_all(x, "CC?")
## [[1]]
##      [,1]
## [1,] "CC"
## [2,] "C"

### 9.7.6 案例6

df <- tibble(
x = c("1234", "B246", "217C", "2357f", "21WD4")
)
df
## # A tibble: 5 x 1
##   x
##   <chr>
## 1 1234
## 2 B246
## 3 217C
## 4 2357f
## 5 21WD4
# Sum the digits contained in each string.
# str_extract_all() returns a list of character vectors (one digit per
# element), which unnest() expands to one row per digit; str_match_all()
# would instead return a list of one-column matrices.
# A plain mutate() replaces the superseded mutate_at(vars(...)).
df %>%
  mutate(num = str_extract_all(x, "\\d")) %>%
  unnest(num) %>%
  mutate(num = as.numeric(num)) %>%
  group_by(x) %>%
  summarise(sum = sum(num))
## # A tibble: 5 x 2
##   x       sum
##   <chr> <dbl>
## 1 1234     10
## 2 217C     10
## 3 21WD4     7
## 4 2357f    17
## 5 B246     12

### 9.7.7 案例7

text <- "Quantum entanglement is a physical phenomenon that occurs when pairs or groups of particles are generated, interact, or share spatial proximity in ways such that the quantum state of each particle cannot be described independently of the state of the others, even when the particles are separated by a large distance."

pairs <-
tibble::tribble(
~item, ~code,
"Quantum entanglement", "A01",
"physical phenomenon", "A02",
"quantum state", "A03",
"quantum mechanics", "A04"
) %>%
tibble::deframe()

text %>% str_replace_all(pairs)
## [1] "A01 is a A02 that occurs when pairs or groups of particles are generated, interact, or share spatial proximity in ways such that the A03 of each particle cannot be described independently of the state of the others, even when the particles are separated by a large distance."

## 9.8 回答提问

## # A tibble: 8 x 2
##   <int> <chr>
## 1     1 Sichuan Univ, Coll Chem
## 2     2 Sichuan Univ, Coll Elect Engn
## 3     3 Sichuan Univ, Dept Phys
## 4     4 Sichuan Univ, Coll Life Sci
## 5     6 Sichuan Univ, Food Engn
## 6     7 Sichuan Univ, Coll Phys
## 7     8 Sichuan Univ, Sch Business
## 8     9 Wuhan Univ, Mat Sci
# NOTE(review): the arguments of mutate() appear to have been lost from this
# source. Nothing visible here creates the `coll` column that unnest() is
# asked to expand, so the expression must be restored before this can run.
d %>%
dplyr::mutate(
) %>%
tidyr::unnest(coll, keep_empty = TRUE)
## # A tibble: 8 x 3
##   <int> <chr>                         <chr>
## 1     1 Sichuan Univ, Coll Chem       " Coll Chem"
## 2     2 Sichuan Univ, Coll Elect Engn " Coll Elect Eng~
## 3     3 Sichuan Univ, Dept Phys       " Dept Phys"
## 4     4 Sichuan Univ, Coll Life Sci   " Coll Life Sci"
## 5     6 Sichuan Univ, Food Engn       " Food Engn"
## 6     7 Sichuan Univ, Coll Phys       " Coll Phys"
## 7     8 Sichuan Univ, Sch Business    " Sch Business"
## 8     9 Wuhan Univ, Mat Sci            <NA>

# NOTE(review): the three pipelines below are incomplete in this source.
# The arguments of mutate() and separate() are missing, and the last pipeline
# has lost its function call entirely (the dangling ")" does not parse).
# Restore them from the original chapter before running.
d %>% mutate(
)
d %>% tidyr::separate(
)
d %>%
)
library(inferregex) # remotes::install_github("daranzolin/inferregex")
# NOTE(review): `s` is not defined in the visible source. Judging from the
# printed pattern it is presumably a string shaped like "abcd-9999-ab9";
# confirm and define it before this call.
infer_regex(s)$regex
## [1] "^[a-z]{4}-\\d{4}-[a-z]{2}\\d$"