数据挖掘---R语言爬虫(基于Hadley的rvest包)

时间：2017-04-01 17:21:48 阅读：219 评论：0 收藏：0 [点我收藏+]

library(rvest)
library(stringr)

# Scrape one page of search results into a data frame.
#
# Args:
#   page:           Page index; it is appended to `urlwithoutpage` to build
#                   the URL that is fetched.
#   urlwithoutpage: Search URL that ends right before the page index value.
#
# Returns:
#   A data.frame with one row per scraped item and the columns title, price,
#   author, publication_date and publishing_house. title, author,
#   publication_date and publishing_house are NA where they cannot be
#   extracted from the page text.
#
# Raises:
#   An error if the title, price and publication-info nodes do not occur the
#   same number of times, because the columns could then not be aligned.
getdata <- function(page, urlwithoutpage) {
  # Read the page, declaring its encoding explicitly.
  web <- read_html(paste0(urlwithoutpage, page), encoding = "GBK")

  # Raw text of the three node sets that make up a result row.
  title_all <- web %>% html_nodes("div ul p.name") %>% html_text()
  price <- web %>% html_nodes("div ul span.search_now_price") %>% html_text()
  book_info <- web %>% html_nodes("p.search_book_author") %>% html_text()

  # data.frame() silently recycles vectors whose lengths divide evenly, which
  # would misalign rows; fail loudly instead.
  if (length(price) != length(title_all) ||
      length(book_info) != length(title_all)) {
    stop(
      "Scraped fields have differing lengths (title: ", length(title_all),
      ", price: ", length(price),
      ", publication info: ", length(book_info), ")",
      call. = FALSE
    )
  }

  # Title: split on the separator characters and keep the second piece, so
  # everything after the next separator is dropped.
  # NOTE(review): presumably the raw text starts with a separator, making the
  # first piece empty - confirm against the live page.
  # vapply() (unlike sapply()) always yields a character vector, even when the
  # page has no results.
  title <- vapply(
    strsplit(title_all, split = "[ ：（(—―]"),
    function(parts) parts[2],
    character(1)
  )

  # Replace ASCII commas inside the field with full-width commas so they are
  # not confused with the CSV delimiter.
  book_info <- gsub(pattern = ",", replacement = "，", book_info)

  # Author: the text before the first "/".
  author <- vapply(
    strsplit(book_info, "/"),
    function(parts) parts[1],
    character(1)
  )

  # Extract the remaining fields with regular expressions.
  publication_date <- str_extract(book_info, "\\d{4}-\\d{2}-\\d{2}")
  publishing_house <- str_extract(book_info, "\\w*出版社\\w*")

  # Assemble the columns into a data frame and return it.
  data.frame(
    title = title,
    price = price,
    author = author,
    publication_date = publication_date,
    publishing_house = publishing_house
  )
}

# Dangdang search URL for the query "R语言" (GBK percent-encoded); the page
# index is appended to the end.
dangdang <- "http://search.dangdang.com/?key=r%D3%EF%D1%D4&act=input&ddt-rpm=undefined&page_index="

# Scrape the first two result pages and stack them into one data frame.
# Collecting the per-page frames in a list and binding once avoids growing
# the data frame with rbind() inside a loop.
pages <- 1:2
final <- do.call(rbind, lapply(pages, getdata, urlwithoutpage = dangdang))

# Write the result as comma-separated text. The file name must be quoted with
# plain ASCII quotes; typographic quotes are a syntax error in R.
write.table(final, "dangdang.csv", sep = ",", row.names = FALSE)

原文：http://www.cnblogs.com/heisaijuzhen/p/6656439.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)