7 English Text Analysis
Code provided by: 谢钰莹, 倪云, 谢桂芳
Main contents:
- 1. Tidy text
- 2. Word frequency analysis and visualization
- 3. Word clouds
- 4. Analyzing word and document frequency: tf-idf
- 5. Case study: mining NASA metadata
7.1 Tidy Text
Load the R package containing Jane Austen's works
library(janeaustenr)
library(dplyr)
Create line numbers and chapter numbers
library(stringr)
original_books <- austen_books() %>%  # %>% is the pipe: it passes the result of one step to the next function
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%  # add line and chapter numbers
  ungroup()
original_books
## # A tibble: 73,422 × 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 "by Jane Austen" Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 "(1811)" Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 "CHAPTER 1" Sense & Sensibility 10 1
## # … with 73,412 more rows
Use the unnest_tokens function to tokenize the text (one word per row)
library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)  # tokenize: one word per row
tidy_books
## # A tibble: 725,055 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # … with 725,045 more rows
Use the anti_join function to remove stop words (function words with little content, such as "the", "to", and "of")
data(stop_words)
tidy_books <- tidy_books %>%
  anti_join(stop_words)
## Joining, by = "word"
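The default stop word list can also be extended with corpus-specific words before the anti_join. A minimal sketch (custom_stop_words and the example words "miss" and "sir" are illustrative, not from the original text):
# extend stop_words with custom, domain-specific entries
custom_stop_words <- bind_rows(stop_words,
                               tibble(word = c("miss", "sir"),
                                      lexicon = "custom"))
tidy_books %>%
  anti_join(custom_stop_words)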
7.2 Word Frequencies (count)
Count word frequencies (find the most commonly used words in the books)
tidy_books %>%
  count(word, sort = TRUE)
## # A tibble: 13,914 × 2
## word n
## <chr> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # … with 13,904 more rows
Visualize word frequencies (with ggplot2)
library(ggplot2)
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%  # keep words that appear more than 600 times
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(y = NULL)
7.3 Word Clouds
Basic usage
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))  # show at most 100 words in the cloud
## Joining, by = "word"
Advanced version (use color to distinguish positive and negative words)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
Extensions
With text mining you can learn, for a novel's text:
- the most frequent words (word frequencies, word clouds)
- the most common positive and negative words (frequency + sentiment)
- how sentiment changes across the whole text (sentiment-analysis visualization)
- how its style differs from other novels
Example 1: the most common positive and negative words (based on the bing-lexicon sentiment results above)
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
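The jane_austen_sentiment object above holds a net sentiment score (positive minus negative) for every block of 80 lines, but it is never plotted here. A minimal sketch of the trajectory visualization, using only the columns computed above (the plot itself is not part of the original output):
# net sentiment per 80-line block, one panel per novel
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")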
# show the results directly
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,555 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 happy positive 534
## 3 love positive 495
## 4 pleasure positive 462
## 5 poor negative 424
## 6 happiness positive 369
## 7 comfort positive 292
## 8 doubt negative 281
## 9 affection positive 272
## 10 perfectly positive 271
## # … with 2,545 more rows
# advanced version: visualize the results
bing_word_counts %>%
  group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
Notes: 1. The packages used in this chapter must be installed with install.packages() before they can be loaded. 2. When running a command that contains the pipe %>%, it is best to run the surrounding code as well, so the pipe has data to operate on. 3. Sentiment analysis sometimes requires handling negation or analyzing sentence by sentence, to avoid cases such as "no good" (negative) being split into "no" and "good" (positive) and distorting the result; pay attention to modifiers and to words with rich meanings, whose polarity can change with context.
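For point 3 above, one common remedy is to tokenize into bigrams so that negated phrases can be inspected directly. A rough sketch (the object names bigrams and negated_words are only illustrative):
# split the text into two-word sequences, then into separate columns
bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# sentiment words preceded by "not", e.g. "not good", "not like"
negated_words <- bigrams %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("bing"), by = c(word2 = "word")) %>%
  count(word2, sentiment, sort = TRUE)
negated_words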
7.4 Analyzing word and document frequency: tf-idf
7.4.1 Term frequency in Jane Austen's novels
library(dplyr)
library(janeaustenr)
library(tidytext)
book_words <- austen_books() %>%
  unnest_tokens(word, text) %>%
count(book, word, sort = TRUE)
total_words <- book_words %>%
  group_by(book) %>%
summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words
## # A tibble: 40,379 × 4
## book word n total
## <fct> <chr> <int> <int>
## 1 Mansfield Park the 6206 160460
## 2 Mansfield Park to 5475 160460
## 3 Mansfield Park and 5438 160460
## 4 Emma to 5239 160996
## 5 Emma the 5201 160996
## 6 Emma and 4896 160996
## 7 Mansfield Park of 4778 160460
## 8 Pride & Prejudice the 4331 122204
## 9 Emma of 4291 160996
## 10 Pride & Prejudice to 4162 122204
## # … with 40,369 more rows
library(ggplot2)
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 896 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
7.4.2 Zipf’s law
freq_by_rank <- book_words %>%
  group_by(book) %>%
mutate(rank = row_number(),
`term frequency` = n/total) %>%
ungroup()
freq_by_rank
## # A tibble: 40,379 × 6
## book word n total rank `term frequency`
## <fct> <chr> <int> <int> <int> <dbl>
## 1 Mansfield Park the 6206 160460 1 0.0387
## 2 Mansfield Park to 5475 160460 2 0.0341
## 3 Mansfield Park and 5438 160460 3 0.0339
## 4 Emma to 5239 160996 1 0.0325
## 5 Emma the 5201 160996 2 0.0323
## 6 Emma and 4896 160996 3 0.0304
## 7 Mansfield Park of 4778 160460 4 0.0298
## 8 Pride & Prejudice the 4331 122204 1 0.0354
## 9 Emma of 4291 160996 4 0.0267
## 10 Pride & Prejudice to 4162 122204 2 0.0341
## # … with 40,369 more rows
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
rank_subset <- freq_by_rank %>%
  filter(rank < 500,
         rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -0.6226 -1.1125
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
geom_abline(intercept = -0.62, slope = -1.1,
color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
Note: Zipf's law says that a word's frequency is inversely proportional to its rank, so on a log-log plot of rank versus frequency the points should fall roughly on a line with slope near -1; the corpus of Jane Austen's novels gives a result close to this classic form (the fitted slope above is about -1.1). The deviation at high ranks is not unusual: a language corpus typically contains fewer rare words than a single power law would predict. The deviation at low ranks is more unusual: Jane Austen uses the most common words at a lower rate than many other collections of language. This kind of analysis can be extended to compare authors or any other collections of text, and it can be carried out simply using tidy data principles.
7.4.3 The bind_tf_idf() function
Note: the idea of tf-idf is to find the important words in each document of a collection by decreasing the weight of commonly used words and increasing the weight of words that are rarely used across the collection or corpus. Below we start from the collection of Jane Austen's novels and compute tf-idf to find words that are important in a text but not too common.
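As a worked illustration of the definition (a sketch of the arithmetic, not of how bind_tf_idf() is implemented): tf is a word's count divided by the total number of words in that book, idf is the natural log of the number of books divided by the number of books containing the word, and tf-idf is their product. Using the book_words object from above (n_books and manual_tf_idf are just illustrative names):
# hand-rolled tf-idf, for comparison with the bind_tf_idf() result below
n_books <- n_distinct(book_words$book)                  # 6 novels
manual_tf_idf <- book_words %>%
  group_by(word) %>%
  mutate(idf = log(n_books / n_distinct(book))) %>%     # rarer across books -> larger idf
  ungroup() %>%
  mutate(tf = n / total,
         tf_idf = tf * idf)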
book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)
book_tf_idf
## # A tibble: 40,379 × 7
## book word n total tf idf tf_idf
## <fct> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Mansfield Park the 6206 160460 0.0387 0 0
## 2 Mansfield Park to 5475 160460 0.0341 0 0
## 3 Mansfield Park and 5438 160460 0.0339 0 0
## 4 Emma to 5239 160996 0.0325 0 0
## 5 Emma the 5201 160996 0.0323 0 0
## 6 Emma and 4896 160996 0.0304 0 0
## 7 Mansfield Park of 4778 160460 0.0298 0 0
## 8 Pride & Prejudice the 4331 122204 0.0354 0 0
## 9 Emma of 4291 160996 0.0267 0 0
## 10 Pride & Prejudice to 4162 122204 0.0341 0 0
## # … with 40,369 more rows
Now let's look at the terms with high tf-idf in Jane Austen's works.
book_tf_idf %>%
  select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 40,379 × 6
## book word n tf idf tf_idf
## <fct> <chr> <int> <dbl> <dbl> <dbl>
## 1 Sense & Sensibility elinor 623 0.00519 1.79 0.00931
## 2 Sense & Sensibility marianne 492 0.00410 1.79 0.00735
## 3 Mansfield Park crawford 493 0.00307 1.79 0.00551
## 4 Pride & Prejudice darcy 373 0.00305 1.79 0.00547
## 5 Persuasion elliot 254 0.00304 1.79 0.00544
## 6 Emma emma 786 0.00488 1.10 0.00536
## 7 Northanger Abbey tilney 196 0.00252 1.79 0.00452
## 8 Emma weston 389 0.00242 1.79 0.00433
## 9 Pride & Prejudice bennet 294 0.00241 1.79 0.00431
## 10 Persuasion wentworth 191 0.00228 1.79 0.00409
## # … with 40,369 more rows
Next, let's visualize these high tf-idf words.
library(forcats)
book_tf_idf %>%
  group_by(book) %>%
slice_max(tf_idf, n = 15) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)
7.5 Case study: mining NASA metadata
How does NASA organize its data? (First, let's download the JSON file and look at the names of what is stored in the metadata.)
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
metadata <- fromJSON("https://data.nasa.gov/data.json")
names(metadata$dataset)
## [1] "accessLevel" "landingPage"
## [3] "bureauCode" "issued"
## [5] "@type" "modified"
## [7] "references" "keyword"
## [9] "contactPoint" "publisher"
## [11] "identifier" "description"
## [13] "title" "programCode"
## [15] "distribution" "accrualPeriodicity"
## [17] "theme" "temporal"
## [19] "spatial" "license"
## [21] "citation" "graphic-preview-description"
## [23] "graphic-preview-file" "language"
## [25] "data-presentation-form" "release-place"
## [27] "series-name" "creator"
## [29] "dataQuality" "editor"
## [31] "issue-identification" "describedBy"
## [33] "describedByType" "rights"
## [35] "systemOfRecords"
class(metadata$dataset$title)
## [1] "character"
class(metadata$dataset$description)
## [1] "character"
class(metadata$dataset$keyword)
## [1] "list"
Wrangling and tidying the data (we will build separate tidy data frames for title, description, and keyword, keeping the dataset id in each so that we can join them as needed in later analyses.)
library(dplyr)
nasa_title <- tibble(id = metadata$dataset$`_id`$`$oid`,
                     title = metadata$dataset$title)
nasa_title
## # A tibble: 45,366 × 1
## title
## <chr>
## 1 "ROSETTA-ORBITER EARTH RPCMAG 2 EAR2 RAW V3.0"
## 2 "Sealed Planetary Return Canister (SPRC), Phase II"
## 3 "Enhanced ORCA and CLARREO Depolarizers Using AR Microstructures, Phase I"
## 4 "GPM SSMIS on F17 (GPROF) Climate-based Radiometer Precipitation Profiling L…
## 5 "NEAR EROS RADIO SCIENCE DATA SET - EROS/ORBIT V1.0"
## 6 "A Constraint-Based Geospatial Data Integration System for Wildfire Manageme…
## 7 "MODIS/Terra+Aqua BRDF/Albedo Black Sky Albedo Band2 Daily L3 Global 30ArcSe…
## 8 "NEW HORIZONS\n LEISA KEM1\n CALIBRATED V2.0"
## 9 "Goggle-Based Visual Field Device"
## 10 "Highly Accurate Sensor for High-Purity Oxygen Determination, Phase II"
## # … with 45,356 more rows
nasa_desc <- tibble(id = metadata$dataset$`_id`$`$oid`,
                    desc = metadata$dataset$description)
nasa_desc %>%
  select(desc) %>%
sample_n(5)
## # A tibble: 5 × 1
## desc
## <chr>
## 1 "Time periods when the >100 MeV fluxes computed via the light-bucket method d…
## 2 "The Mars Express SPICAM level 1A IR data set contains clean measurements fro…
## 3 "The ABLE 2A and 2B (Atmospheric Boundary Layer Experiments) data consists of…
## 4 "We propose to investigate the feasibility of developing a low noise,\ntwo-si…
## 5 "The Moderate-resolution Imaging Spectroradiometer (MODIS) is a scientific in…
Now we can build the tidy data frame for the keywords; in this case we will use the unnest() function from tidyr.
library(tidyr)
nasa_keyword <- tibble(id = metadata$dataset$`_id`$`$oid`,
                       keyword = metadata$dataset$keyword) %>%
unnest(keyword)
nasa_keyword
## # A tibble: 196,883 × 1
## keyword
## <chr>
## 1 earth
## 2 unknown
## 3 international rosetta mission
## 4 jet propulsion laboratory
## 5 completed
## 6 completed
## 7 goddard space flight center
## 8 earth science
## 9 atmosphere
## 10 atmospheric water vapor
## # … with 196,873 more rows
library(tidytext)
nasa_title <- nasa_title %>%
  unnest_tokens(word, title) %>%
anti_join(stop_words)
## Joining, by = "word"
nasa_desc <- nasa_desc %>%
  unnest_tokens(word, desc) %>%
anti_join(stop_words)
## Joining, by = "word"
nasa_title
## # A tibble: 406,875 × 1
## word
## <chr>
## 1 rosetta
## 2 orbiter
## 3 earth
## 4 rpcmag
## 5 2
## 6 ear2
## 7 raw
## 8 v3.0
## 9 sealed
## 10 planetary
## # … with 406,865 more rows
nasa_desc
## # A tibble: 4,027,737 × 1
## word
## <chr>
## 1 dataset
## 2 edited
## 3 raw
## 4 data
## 5 earth
## 6 flyby
## 7 ear2
## 8 closest
## 9 approach
## 10 ca
## # … with 4,027,727 more rows
Some initial simple exploration (What are the most common words in NASA dataset titles? We can use count() from dplyr to check.)
nasa_title %>%
  count(word, sort = TRUE)
## # A tibble: 18,353 × 2
## word n
## <chr> <int>
## 1 phase 8686
## 2 data 6879
## 3 v1.0 6197
## 4 1 5629
## 5 2 5270
## 6 3 4884
## 7 ges 4052
## 8 disc 4050
## 9 rosetta 4031
## 10 orbiter 3890
## # … with 18,343 more rows
What about the descriptions?
nasa_desc %>%
  count(word, sort = TRUE)
## # A tibble: 58,429 × 2
## word n
## <chr> <int>
## 1 data 92713
## 2 system 24755
## 3 product 22007
## 4 2 20899
## 5 set 18802
## 6 1 17467
## 7 phase 16678
## 8 surface 16245
## 9 resolution 15345
## 10 level 14911
## # … with 58,419 more rows
# custom stop words: bare digits and version strings that carry no meaning
my_stopwords <- tibble(word = c(as.character(1:10),
                                "v1", "v03", "l2", "l3", "l4", "v5.2.0",
                                "v003", "v004", "v005", "v006", "v7"))
nasa_title <- nasa_title %>%
  anti_join(my_stopwords)
## Joining, by = "word"
nasa_desc <- nasa_desc %>%
  anti_join(my_stopwords)
## Joining, by = "word"
What are the most common keywords?
nasa_keyword %>%
  group_by(keyword) %>%
count(sort = TRUE)
## # A tibble: 9,308 × 2
## # Groups: keyword [9,308]
## keyword n
## <chr> <int>
## 1 earth science 20152
## 2 completed 9021
## 3 atmosphere 8616
## 4 national geospatial data asset 5858
## 5 ngda 5858
## 6 oceans 4821
## 7 land surface 4245
## 8 international rosetta mission 3806
## 9 spectral/engineering 3251
## 10 67p/churyumov-gerasimenko 1 (1969 r1) 2977
## # … with 9,298 more rows
We may want to convert all keywords to upper case or lower case to remove duplicates such as "OCEANS" and "Oceans". Here is how we can do that.
nasa_keyword <- nasa_keyword %>%
  mutate(keyword = toupper(keyword))
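As a quick follow-up check (not part of the original output), counting the keywords again after the case conversion would show spellings such as "OCEANS" and "Oceans" collapsed into a single entry:
# keyword counts after converting to upper case
nasa_keyword %>%
  count(keyword, sort = TRUE)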