1. Setup
library(rvest)
library(magrittr)
library(stringr)
library(jiebaR)
library(wordcloud2)
1.1 Define the page nodes
tag_title <- ".J_biaoti" # 论文题目
tag_author<-".J_author_cn" #论文作者
tag_institution<-"//td"#作者单位
tag_abstract<-".J_zhaiyao"#摘要
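Before scraping all 221 pages, it is worth checking the selectors against a single abstract page; a quick sanity check using the first page in the range:
page_test <- read_html("https://www.psysci.org/CN/abstract/abstract10038.shtml")
html_nodes(page_test, css = tag_title) %>% html_text()   # should print one paper title
html_nodes(page_test, css = tag_author) %>% html_text()  # should print the author string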
2. Scraping the pages
The 2018 articles of Psychological Science (心理科学) are numbered consecutively from https://www.psysci.org/CN/abstract/abstract10038 to https://www.psysci.org/CN/abstract/abstract10258, so we can use a for loop to scrape each page in turn.
First, create an empty data frame to hold the scraped data.
psysin_2018_1 <- data.frame(title = NULL, author = NULL,
                            institution = NULL, keywords = NULL,
                            stringsAsFactors = FALSE)
Next, run the scraping loop.
for (n in 10038:10258) {
  # paste0 already joins without a separator, so no sep argument is needed
  myurl <- paste0("https://www.psysci.org/CN/abstract/abstract", n, ".shtml")
  page_n <- read_html(x = myurl)
  # Extract the fields
  title_n <- html_nodes(x = page_n, css = tag_title) %>% html_text()
  author_n <- html_nodes(x = page_n, css = tag_author) %>% html_text()
  institution_n <- html_nodes(x = page_n, xpath = tag_institution) %>% html_text()
  abstract_n <- html_nodes(x = page_n, css = tag_abstract) %>% html_text()
  # Abstract: strip carriage returns, newlines, and tabs
  abstract_n <- gsub("[\r\n\t]", "", abstract_n)
  # Keywords: the fifth element of the scraped abstract block
  keywords_n <- abstract_n[5]
  # Affiliation: the 24th <td> node on the page
  institution_n <- institution_n[24]
  # Combine the vectors into a one-row data frame
  paper_work_n <- data.frame(title = title_n,
                             author = author_n,
                             institution = institution_n,
                             keywords = keywords_n,
                             stringsAsFactors = FALSE)
  psysin_2018_1 <- rbind(psysin_2018_1, paper_work_n)  # append to psysin_2018_1
}
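The loop above stops at the first page that fails to load, and growing the data frame with rbind on every iteration is slow. A hedged alternative sketch: wrap each request in tryCatch, pause between requests, collect the rows in a list, and bind them once at the end (the field extraction is the same as above).
results <- vector("list", 221)  # one slot per page
for (n in 10038:10258) {
  results[[n - 10037]] <- tryCatch({
    page_n <- read_html(paste0("https://www.psysci.org/CN/abstract/abstract", n, ".shtml"))
    abstract_n <- gsub("[\r\n\t]", "", html_nodes(page_n, css = tag_abstract) %>% html_text())
    data.frame(
      title       = html_nodes(page_n, css = tag_title) %>% html_text(),
      author      = html_nodes(page_n, css = tag_author) %>% html_text(),
      institution = (html_nodes(page_n, xpath = tag_institution) %>% html_text())[24],
      keywords    = abstract_n[5],
      stringsAsFactors = FALSE
    )
  }, error = function(e) NULL)  # skip pages that fail to load or parse
  Sys.sleep(1)                  # pause between requests to be polite to the server
}
psysin_2018_1 <- do.call(rbind, results)  # NULL entries are dropped by rbind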
3. Data cleaning and processing
3.1 Cleaning the author data
author_2018 <- gsub("[0-9]", "", psysin_2018_1[, 2])  # strip superscript affiliation numbers ("[0-9]" rather than "[1-9]", so a "0" is removed too)
3.2 Cleaning the affiliation data
# Strip affiliation numbers
institution_2018 <- gsub("[0-9]", "", psysin_2018_1[, 3])
# Strip the leading ". " left over after the numbers were removed
for (i in seq_along(institution_2018)) {
  if (str_sub(institution_2018[i], start = 1, end = 2) == ". ") {
    # start = 3 drops both the period and the space
    institution_2018[i] <- str_sub(institution_2018[i], start = 3)
  }
}
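The same cleanup can be written without a loop; str_replace is vectorized and only substitutes where the pattern matches, so no explicit if is needed:
# Equivalent vectorized form: strip a leading ". " where present
institution_2018 <- str_replace(institution_2018, "^\\. ", "")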
# Collapse variants: map any affiliation containing one of these institution
# names onto the bare name
a <- c("华中师范大学", "北京师范大学", "华东师范大学", "西南大学", "华南师范大学",
       "北京大学", "中国科学院", "江西师范大学", "天津师范大学", "山东师范大学",
       "浙江师范大学", "福建师范大学", "陕西师范大学", "西北师范大学",
       "辽宁师范大学", "南京师范大学", "中国人民大学", "东北师范大学",
       "浙江大学", "暨南大学")
for (i in a) {
  institution_2018[grepl(i, institution_2018, fixed = TRUE)] <- i
}
### Keep only the first affiliation — done by hand in the exported CSV
write.csv(institution_2018, "D:/future_plan/Rcourse/psysin2018/institution.csv")
institution_2018 <- read.csv("D:/future_plan/Rcourse/psysin2018/institution.csv", header = TRUE)
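If the scraped strings separate multiple affiliations with a consistent delimiter, the manual CSV edit could be avoided. A sketch, applied to the character vector before the write.csv round trip above, assuming ";" is the delimiter (check the real strings first; the actual separator may differ):
# Hypothetical: keep only the text before the first ";"
institution_first <- str_split(institution_2018, ";", simplify = TRUE)[, 1]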
3.3 Cleaning the keyword data
# Remove non-breaking spaces (they print as <U+00A0> in the console; match the
# actual character \u00A0, not its printed form)
keyword_2018 <- gsub("\u00A0", "", psysin_2018_1[, 4])
# Strip the literal label "关键词" ("keywords")
keyword_2018 <- gsub("关键词", "", keyword_2018)
4. Building the word clouds
4.1 Author word cloud
# Split each author string on commas ("[,,]" matches both the fullwidth and
# the ASCII comma, in case the pages mix them)
split_author <- str_split(author_2018, "[,,]")
# Flatten the list into a character vector
author1 <- unlist(split_author)
# Count author frequencies (jiebaR::freq)
author_freq <- freq(author1)
# Build the word cloud (wordcloud2 returns an HTML widget, not a base-graphics plot)
wc_author <- wordcloud2(author_freq)
wc_author
# Set the output directory
setwd("D:/future_plan/Rcourse/psysin2018")
# png() cannot capture an HTML widget; one common way to save it as an image
# is to write the widget to HTML and screenshot it (htmlwidgets + webshot)
htmlwidgets::saveWidget(wc_author, "author.html", selfcontained = FALSE)
webshot::webshot("author.html", "author.png", vwidth = 480, vheight = 480, delay = 5)
4.2 Institution word cloud
# Extract the affiliation column (the value column of the CSV read back above)
institution1 <- as.vector(unlist(institution_2018[2]))
# Count affiliation frequencies
institution_freq <- freq(institution1)
# Build the word cloud
wc_institution <- wordcloud2(institution_freq)
wc_institution
# Save it the same way as the author cloud
setwd("D:/future_plan/Rcourse/psysin2018")
htmlwidgets::saveWidget(wc_institution, "institution.html", selfcontained = FALSE)
webshot::webshot("institution.html", "institution.png", vwidth = 480, vheight = 480, delay = 5)
4.3 Keyword word cloud
# Split each keyword string on commas (fullwidth or ASCII)
split_kw <- str_split(keyword_2018, "[,,]")
# Flatten the list into a character vector
kw1 <- unlist(split_kw)
# Count keyword frequencies
kw_freq <- freq(kw1)
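freq() returns a data frame of words and counts; dropping keywords that occur only once can keep the cloud readable. An optional step (column names as returned by jiebaR::freq):
# Optional: keep only keywords that appear more than once
kw_freq <- kw_freq[kw_freq$freq > 1, ]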
# Build the word cloud
wc_kw <- wordcloud2(kw_freq)
wc_kw
# Save it the same way as above
setwd("D:/future_plan/Rcourse/psysin2018")
htmlwidgets::saveWidget(wc_kw, "kw.html", selfcontained = FALSE)
webshot::webshot("kw.html", "kw.png", vwidth = 480, vheight = 480, delay = 5)
5. Writing out the Chinese data
# On Windows, switching the locale before writing keeps the Chinese characters
# from being garbled in the CSV:
# Sys.setlocale(, "Chinese")
# write.csv(institution_2018, "D:/future_plan/Rcourse/psysin2018/institution1.csv")
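An alternative that avoids switching the locale is to write the file as UTF-8 explicitly:
# Write the CSV as UTF-8 so the Chinese characters survive regardless of locale
write.csv(institution_2018, "D:/future_plan/Rcourse/psysin2018/institution1.csv",
          fileEncoding = "UTF-8")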