7 English Text Analysis
Code provided by: 谢钰莹, 倪云, 谢桂芳
Main contents:
- 1. Tidy text
- 2. Word frequency analysis and visualization
- 3. Word clouds
- 4. Analyzing word and document frequency: tf-idf
- 5. Case study: mining NASA metadata
7.1 Tidy Text
Load the R package containing Jane Austen's works.
library(janeaustenr)
library(dplyr)
Create line numbers and chapter numbers.
library(stringr)
original_books <- austen_books() %>%  # %>% is the pipe operator: it passes the previous step's result to the next function
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%  # create line and chapter numbers
  ungroup()
original_books
## # A tibble: 73,422 × 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 "by Jane Austen" Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 "(1811)" Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 "CHAPTER 1" Sense & Sensibility 10 1
## # ℹ 73,412 more rows
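To make the pipe concrete, here is a minimal illustration (not part of the original analysis): x %>% f(y) is equivalent to f(x, y), so each step's result becomes the first argument of the next call.
c(1, 4, 9) %>% sqrt() %>% sum()  # same as sum(sqrt(c(1, 4, 9))), i.e. 1 + 2 + 3 = 6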
Tokenize with the unnest_tokens function (one word per row).
library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)  # split the text into one word per row
tidy_books
## # A tibble: 725,055 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ℹ 725,045 more rows
Use the anti_join function to remove stop words (function words with little meaning on their own, such as "the", "to", "of").
data(stop_words)
tidy_books <- tidy_books %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
7.2 Word Frequency (count)
Count word frequencies (find the most common words in the books).
tidy_books %>%
  count(word, sort = TRUE)
## # A tibble: 13,914 × 2
## word n
## <chr> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # ℹ 13,904 more rows
Visualize the word frequencies (with ggplot2).
library(ggplot2)
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%  # keep only words that appear more than 600 times
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL)
7.3 Word Clouds
Basic usage
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))  # show at most 100 words in the cloud
## Joining with `by = join_by(word)`
Advanced version (use color to distinguish positive and negative sentiment)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 131015 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
Extensions
With text mining, you can learn about a novel's:
- High-frequency words (word frequency, word cloud)
- Most common positive and negative words (word frequency + sentiment)
- Sentiment trajectory over the course of the text (sentiment analysis visualization; see the sketch below)
- Stylistic differences from other novels
Example 1: the most common positive and negative words (based on the bing-lexicon sentiment results above)
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%  # %/% is integer division: split each book into 80-line chunks
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)  # net sentiment of each chunk
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 131015 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
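jane_austen_sentiment now holds the net sentiment of each 80-line chunk. A possible way to plot the sentiment trajectory mentioned above (a sketch, not part of the original code):
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +                 # one bar per 80-line chunk
  facet_wrap(~book, ncol = 2, scales = "free_x")  # one panel per novel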
# present the results directly
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 131015 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
bing_word_counts
## # A tibble: 2,555 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 happy positive 534
## 3 love positive 495
## 4 pleasure positive 462
## 5 poor negative 424
## 6 happiness positive 369
## 7 comfort positive 292
## 8 doubt negative 281
## 9 affection positive 272
## 10 perfectly positive 271
## # ℹ 2,545 more rows
# advanced version: visualize the results
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)
Notes:
- 1. The packages used in this chapter must be installed with install.packages() before they can be loaded.
- 2. When running a command that uses the pipe %>%, also run the surrounding steps, so the pipeline can find the data it depends on.
- 3. Sentiment analysis sometimes requires accounting for how a word is used, or analyzing the text sentence by sentence (or in bigrams), to avoid cases such as "no good" (negative) being split into "no" and "good" (positive) and distorting the final result; see the sketch below. Also watch modifiers and words with several meanings, whose sentiment can shift with context.
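As an illustration of note 3, one common workaround is to tokenize into bigrams and look at words preceded by a negation term. This is only a sketch, not part of the original analysis:
library(dplyr)
library(tidyr)
library(tidytext)
library(janeaustenr)
austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%  # two-word tokens
  separate(bigram, c("word1", "word2"), sep = " ")
# words that follow "not": their bing sentiment is effectively reversed in context
austen_bigrams %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("bing"), by = c(word2 = "word")) %>%
  count(word2, sentiment, sort = TRUE)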
7.4 Analyzing Word and Document Frequency: tf-idf
7.4.1 Term frequency in Jane Austen's novels
library(dplyr)
library(janeaustenr)
library(tidytext)
book_words <- austen_books() %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))

book_words <- left_join(book_words, total_words)
## Joining with `by = join_by(book)`
book_words
## # A tibble: 40,379 × 4
## book word n total
## <fct> <chr> <int> <int>
## 1 Mansfield Park the 6206 160460
## 2 Mansfield Park to 5475 160460
## 3 Mansfield Park and 5438 160460
## 4 Emma to 5239 160996
## 5 Emma the 5201 160996
## 6 Emma and 4896 160996
## 7 Mansfield Park of 4778 160460
## 8 Pride & Prejudice the 4331 122204
## 9 Emma of 4291 160996
## 10 Pride & Prejudice to 4162 122204
## # ℹ 40,369 more rows
library(ggplot2)
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 896 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 6 rows containing missing values (`geom_bar()`).
7.4.2 Zipf’s law
freq_by_rank <- book_words %>%
  group_by(book) %>%
  mutate(rank = row_number(),
         `term frequency` = n/total) %>%
  ungroup()
freq_by_rank
## # A tibble: 40,379 × 6
## book word n total rank `term frequency`
## <fct> <chr> <int> <int> <int> <dbl>
## 1 Mansfield Park the 6206 160460 1 0.0387
## 2 Mansfield Park to 5475 160460 2 0.0341
## 3 Mansfield Park and 5438 160460 3 0.0339
## 4 Emma to 5239 160996 1 0.0325
## 5 Emma the 5201 160996 2 0.0323
## 6 Emma and 4896 160996 3 0.0304
## 7 Mansfield Park of 4778 160460 4 0.0298
## 8 Pride & Prejudice the 4331 122204 1 0.0354
## 9 Emma of 4291 160996 4 0.0267
## 10 Pride & Prejudice to 4162 122204 2 0.0341
## # ℹ 40,369 more rows
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
rank_subset <- freq_by_rank %>%
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -0.6226 -1.1125
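Zipf's law says that a word's frequency is inversely proportional to its rank, i.e. frequency ≈ c / rank^s with s close to 1, which on log-log axes is a straight line with slope about -1. The fit above corresponds to term frequency ≈ 10^(-0.62) × rank^(-1.11), so the estimated exponent is close to the classic value; the next plot overlays this fitted line on the rank-frequency curves.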
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_abline(intercept = -0.62, slope = -1.1,
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
Note: In the corpus of Jane Austen's novels we find a result close to the classic form of Zipf's law. The deviation at high ranks is not uncommon for many kinds of language: a language corpus often contains fewer rare words than a single power law would predict. The deviation at low ranks is more unusual; Jane Austen uses a lower percentage of the most common words than many collections of language do. This kind of analysis can be extended to compare authors or any other collections of text, and it can be implemented simply using tidy data principles.
7.4.3 The bind_tf_idf() function
Note: The idea of tf-idf is to find the words that are important to each document's content by decreasing the weight of commonly used words and increasing the weight of words that are rarely used across the collection or corpus. Below we start from Jane Austen's novels and compute tf-idf to find words that are important in a text but not too common.
book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)
book_tf_idf
## # A tibble: 40,379 × 7
## book word n total tf idf tf_idf
## <fct> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Mansfield Park the 6206 160460 0.0387 0 0
## 2 Mansfield Park to 5475 160460 0.0341 0 0
## 3 Mansfield Park and 5438 160460 0.0339 0 0
## 4 Emma to 5239 160996 0.0325 0 0
## 5 Emma the 5201 160996 0.0323 0 0
## 6 Emma and 4896 160996 0.0304 0 0
## 7 Mansfield Park of 4778 160460 0.0298 0 0
## 8 Pride & Prejudice the 4331 122204 0.0354 0 0
## 9 Emma of 4291 160996 0.0267 0 0
## 10 Pride & Prejudice to 4162 122204 0.0341 0 0
## # ℹ 40,369 more rows
Let's look at the high tf-idf terms in Jane Austen's works.
book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 40,379 × 6
## book word n tf idf tf_idf
## <fct> <chr> <int> <dbl> <dbl> <dbl>
## 1 Sense & Sensibility elinor 623 0.00519 1.79 0.00931
## 2 Sense & Sensibility marianne 492 0.00410 1.79 0.00735
## 3 Mansfield Park crawford 493 0.00307 1.79 0.00551
## 4 Pride & Prejudice darcy 373 0.00305 1.79 0.00547
## 5 Persuasion elliot 254 0.00304 1.79 0.00544
## 6 Emma emma 786 0.00488 1.10 0.00536
## 7 Northanger Abbey tilney 196 0.00252 1.79 0.00452
## 8 Emma weston 389 0.00242 1.79 0.00433
## 9 Pride & Prejudice bennet 294 0.00241 1.79 0.00431
## 10 Persuasion wentworth 191 0.00228 1.79 0.00409
## # ℹ 40,369 more rows
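The idf values above follow directly from the definition used by bind_tf_idf(), idf(term) = ln(number of documents / number of documents containing the term). With six novels, a quick check (illustrative only):
log(6 / 1)  # ≈ 1.79: words such as "elinor" that appear in only one of the six novels
log(6 / 2)  # ≈ 1.10: words such as "emma" that appear in two of the six novels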
Next, let's visualize these high tf-idf words.
library(forcats)
book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)
7.5 Case Study: Mining NASA Metadata
How does NASA organize its data? First, let's download the JSON file and look at the names of the fields stored in the metadata.
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
# downloading from the site can be slow, so raise the timeout (the default is 60 s)
getOption('timeout')
## [1] 60
options(timeout=10000)
metadata <- fromJSON("https://data.nasa.gov/data.json")
names(metadata$dataset)
## [1] "accessLevel" "landingPage"
## [3] "bureauCode" "issued"
## [5] "@type" "modified"
## [7] "references" "keyword"
## [9] "contactPoint" "publisher"
## [11] "identifier" "description"
## [13] "title" "programCode"
## [15] "distribution" "accrualPeriodicity"
## [17] "theme" "license"
## [19] "citation" "temporal"
## [21] "release-place" "series-name"
## [23] "graphic-preview-description" "creator"
## [25] "graphic-preview-file" "spatial"
## [27] "language" "data-presentation-form"
## [29] "dataQuality" "editor"
## [31] "issue-identification" "describedBy"
## [33] "describedByType" "rights"
## [35] "systemOfRecords"
class(metadata$dataset$title)
## [1] "character"
class(metadata$dataset$description)
## [1] "character"
class(metadata$dataset$keyword)
## [1] "list"
Wrangling and tidying the data. We will build separate tidy data frames for title, description, and keyword, keeping the dataset id in each so that we can join them as needed in later analyses.
library(dplyr)
nasa_title <- tibble(id = metadata$dataset$`_id`$`$oid`,
                     title = metadata$dataset$title)
nasa_title
## # A tibble: 22,137 × 1
## title
## <chr>
## 1 "ROSETTA-ORBITER EARTH RPCMAG 2 EAR2 RAW V3.0"
## 2 "NEAR EROS RADIO SCIENCE DATA SET - EROS/ORBIT V1.0"
## 3 "NEW HORIZONS\n LEISA KEM1\n CALIBRATED V2.0"
## 4 "ROSETTA-ORBITER 67P RSI 1/2/3\n COMET E…
## 5 "ASTEROID OCCULTATIONS V14.0"
## 6 "MetOp-A ASCAT ESDR Level 2 Modeled Ocean Surface Auxiliary Fields Version 1…
## 7 "NARSTO SHEMP Particulate Matter Composition Data, Canada, 2000-2002"
## 8 "VOYAGER 2 JUPITER MAGNETOMETER RESAMPLED DATA 1.92 SEC"
## 9 "Fire Particulate Emissions from Combined VIIRS and AHI Data for Indonesia, …
## 10 "Sounder SIPS: Suomi NPP CrIMSS Level 3 Comprehensive Quality Control Gridde…
## # ℹ 22,127 more rows
nasa_desc <- tibble(id = metadata$dataset$`_id`$`$oid`,
                    desc = metadata$dataset$description)
nasa_desc %>%
  select(desc) %>%
  sample_n(5)
## # A tibble: 5 × 1
## desc
## <chr>
## 1 "This dataset contains raw calibration images acquired by the High Resolution…
## 2 "Spectroscopy of Jupiter, Saturnian rings, atmospheres and satellites for det…
## 3 "Aquarius Level 3 sea surface spiciness standard mapped image data contains g…
## 4 "This CODMAC level 4 data set contains solar stray light corrected, radiometr…
## 5 "This data set contains Raw data taken by the New Horizons Stu…
Now we can build a tidy data frame for the keywords; because each dataset can have several keywords, here we use the unnest() function from tidyr.
library(tidyr)
nasa_keyword <- tibble(id = metadata$dataset$`_id`$`$oid`,
                       keyword = metadata$dataset$keyword) %>%
  unnest(keyword)
nasa_keyword
## # A tibble: 114,573 × 1
## keyword
## <chr>
## 1 unknown
## 2 international rosetta mission
## 3 earth
## 4 near earth asteroid rendezvous
## 5 eros
## 6 vega
## 7 new horizons kuiper belt extended mission
## 8 international rosetta mission
## 9 67p/churyumov-gerasimenko 1 (1969 r1)
## 10 satellite
## # ℹ 114,563 more rows
library(tidytext)
nasa_title <- nasa_title %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
nasa_desc <- nasa_desc %>%
  unnest_tokens(word, desc) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
nasa_title
## # A tibble: 202,093 × 1
## word
## <chr>
## 1 rosetta
## 2 orbiter
## 3 earth
## 4 rpcmag
## 5 2
## 6 ear2
## 7 raw
## 8 v3.0
## 9 eros
## 10 radio
## # ℹ 202,083 more rows
nasa_desc
## # A tibble: 1,578,871 × 1
## word
## <chr>
## 1 dataset
## 2 edited
## 3 raw
## 4 data
## 5 earth
## 6 flyby
## 7 ear2
## 8 closest
## 9 approach
## 10 ca
## # ℹ 1,578,861 more rows
Some initial simple exploration. What are the most common words in the NASA dataset titles? We can check with count() from dplyr.
nasa_title %>%
  count(word, sort = TRUE)
## # A tibble: 12,434 × 2
## word n
## <chr> <int>
## 1 v1.0 6183
## 2 data 4528
## 3 2 4125
## 4 rosetta 4031
## 5 1 3928
## 6 orbiter 3887
## 7 3 3780
## 8 67p 2676
## 9 global 1735
## 10 ges 1706
## # ℹ 12,424 more rows
What about the descriptions?
nasa_desc %>%
  count(word, sort = TRUE)
## # A tibble: 35,578 × 2
## word n
## <chr> <int>
## 1 data 52494
## 2 set 13047
## 3 product 10406
## 4 2 9701
## 5 1 8485
## 6 version 8032
## 7 surface 7651
## 8 global 7506
## 9 level 7223
## 10 time 7041
## # ℹ 35,568 more rows
# remove digits and version-like tokens (e.g. "v1") that are not meaningful here
my_stopwords <- tibble(word = c(as.character(1:10),
                                "v1", "v03", "l2", "l3", "l4", "v5.2.0",
                                "v003", "v004", "v005", "v006", "v7"))
nasa_title <- nasa_title %>%
  anti_join(my_stopwords)
## Joining with `by = join_by(word)`
nasa_desc <- nasa_desc %>%
  anti_join(my_stopwords)
## Joining with `by = join_by(word)`
What are the most common keywords?
nasa_keyword %>%
  group_by(keyword) %>%
  count(sort = TRUE)
## # A tibble: 9,093 × 2
## # Groups: keyword [9,093]
## keyword n
## <chr> <int>
## 1 earth science 9666
## 2 atmosphere 4133
## 3 international rosetta mission 3806
## 4 67p/churyumov-gerasimenko 1 (1969 r1) 2977
## 5 land surface 2218
## 6 oceans 1905
## 7 spectral/engineering 1551
## 8 biosphere 1409
## 9 atmospheric water vapor 1339
## 10 mars 1321
## # ℹ 9,083 more rows
We may want to convert all keywords to upper case (or lower case) to remove duplicates such as "OCEANS" and "Oceans". Here is how:
nasa_keyword <- nasa_keyword %>%
  mutate(keyword = toupper(keyword))
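A quick follow-up check (illustrative, not from the original code): after the conversion, variants such as "OCEANS" and "Oceans" collapse into a single keyword, which can be confirmed by counting again.
nasa_keyword %>%
  count(keyword, sort = TRUE)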