8 中文文本分析
代码提供: 张柏珊 王楠
主要内容:
-1.安装拓展包和导入 -2.分词 -3.运用SQL -4.词云 -5.词频可视化
8.1 安装拓展包和导入
8.1.1 安装拓展包
library(jiebaR)
## Loading required package: jiebaRD
library(tidyverse)
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(wordcloud2)
library(dplyr)
8.1.2 导入文档,建议用记事本,格式为UTF-8,运用scan函数
# Read the source text into a character vector, one element per line:
# sep = "\n" splits on newlines and what = "" makes scan() return strings.
f <- scan("zfgzbg.txt", sep = "\n", encoding = "UTF-8", what = "")
8.2 结巴分词处理
8.2.1 制作词表
8.2.1.1 标停止词
# Segmentation worker configured with a user dictionary file and a stop-word
# file (both paths are relative to the working directory).
engine1 <- worker(user = "users.txt", stop_word = "stopwords.txt")

# Hand-written list of Chinese stop words.
# NOTE(review): not referenced by any later statement visible in this
# document -- confirm whether it is still needed.
stopwords_CN <- c(
  "被", "怎么", "还是", "多少", "得", "吗", "给",
  "年", "月", "还", "个", "能", "日", "什么", "做", "没", "啊",
  "的", "了", "在", "是", "我", "有", "和", "就", "不",
  "人", "都", "一", "一个", "上", "也", "很", "到", "说",
  "要", "去", "你", "会", "着", "没有", "看", "好",
  "自己", "这", "等", "各位代表"
)
library('jiebaR')
8.2.1.2 变量seg保存文章所有的词语
# Segment every line of `f` with jiebaR's quick-mode segmenter and keep the
# resulting words in `seg`.
# NOTE(review): quick mode (`qseg`) is deprecated and emits a warning;
# presumably segment(f, worker()) is the replacement -- confirm before
# switching.
seg <- qseg[f]
## Warning in `[.qseg`(qseg, f): Quick mode is depreciated, and is scheduled to be
## remove in v0.11.0. If you want to keep this feature, please submit a issue on
## GitHub page to let me know.
8.2.1.3 保留字数大于1的词语
# Keep only the words that are longer than one character.
seg <- seg[nchar(seg) > 1]
8.2.1.4 用数据框函数将这些词汇排成一列方便处理
# Put the words into a one-column data frame (column name: seg) so they can
# be queried with SQL.
m1 <- data.frame(seg)
8.3 运用SQL
8.3.1 安装并载入sqldf程序包 >group by“根据一定的规则进行分组”,通过一定的规则将一个数据集划分成若干个小的区域,然后针对若干个小区域进行数据处理 >count(1)来计数 >select检索数据
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: /Library/Frameworks/R.framework/Versions/4.1-arm64/Resources/modules/R_X11.so
## Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-17.jdk/Contents/Home/lib/server/libSM.6.dylib' (no such file)
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/Resources/
## library/tcltk/libs//tcltk.so'' had status 1
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
# Count how many times each word occurs: one row per distinct word, with the
# count in column `freg`.
m2 <- sqldf('select seg,count(1)as freg from m1 group by seg')
# Show the class of the query result.
class(m2)
## [1] "data.frame"
8.3.2 按顺序排列
# Sort the words by frequency, most frequent first.
m3 <- m2[order(m2$freg, decreasing = TRUE), ]
8.3.3 抽取频次为前一百的词语
# Keep the 100 most frequent words. head() returns fewer rows when the text
# has fewer than 100 distinct words, whereas m3[1:100, ] would pad the result
# with all-NA rows.
m4 <- head(m3, 100)
8.4 绘制词云
# Draw a star-shaped word cloud from the word/frequency table.
wordcloud2(data = m4, size = 1, shape = "star")
8.5 词频可视化
8.5.1 barplot绘制排名前30的高频词 第一个参数为数据源,第二个参数(names.arg)为标签
# Bar chart of the 30 most frequent words.
# barplot() is base graphics, so a ggplot2 theme() has no effect on it (the
# `+ theme(...)` expression merely evaluated to NULL). The font family is set
# through par() instead and restored afterwards.
top30 <- head(m4, 30)
old_par <- par(family = "Kai")
barplot(top30$freg, names.arg = top30$seg, col = rainbow(nrow(top30)))
par(old_par)
8.5.2 利用pie函数绘制饼图对高频词语进行可视化
提取数据
# Frequencies and words of the top 30 entries, used by the pie charts.
number <- m4$freg[1:30]
label <- m4$seg[1:30]
不带比例的饼图
# Pie chart labelled with the words only.
# pie() is base graphics, so the font family is set with par() rather than a
# ggplot2 theme(), and restored afterwards. pie() returns NULL invisibly;
# `a` is still assigned because the following chart code refers to it.
old_par <- par(family = "Kai")
a <- pie(number, label, main = "TOP30词语")
par(old_par)
带比例的饼图
# Pie chart whose labels carry each word's share of the plotted total.
# The labels are built here from `number` and `label`; the previous
# `a$label` was NULL, so pie() fell back to plain index labels.
pct <- round(number / sum(number) * 100, 1)
pie(number, paste0(label, " ", pct, "%"), main = "TOP30词语")