Chapter 13 Miscellaneous: Web Data Scraping

  • Web-scraping experiments: 古诗词 (classical poetry) / 去哪儿网 (Qunar) / 前程无忧 (51job) / 智联招聘 (Zhaopin)
# Scraping Qunar (去哪儿网) train data
rm(list = ls())
require(magrittr)
require(stringr)
require(lubridate)
require(rvest)
require(curl)
require(jsonlite)

cities <- read.csv("c:/train_interval_price.csv",header=TRUE,stringsAsFactors = FALSE)
cities <- cities[, 1]
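
# cities is assumed to hold station names from the CSV's first column,
# e.g. c("北京", "上海", "广州", ...) (illustrative values, not from the file).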

interval_df <- as.data.frame(matrix(NA, length(cities), length(cities)))
colnames(interval_df) <- rownames(interval_df) <- cities
minprice_df <- interval_df
miles_df <- interval_df
tickettype <- interval_df
duration <- interval_df

url_1 <- "https://train.qunar.com/dict/open/s2s.do?callback=jQuery"
url_2 <- "&dptStation="
url_3 <- "&arrStation="
url_4 <- "&date=2017-10-04&type=normal&user=neibu&source=site&start=1&num=500&sort=3"

for (i in seq(nrow(interval_df))) { 
  
  for (j in seq(ncol(interval_df))) {
    
    if (i == j) {
      
      duration[i, j] <- tickettype[i,j] <- 
        miles_df[i, j] <- minprice_df[i, j] <- 
        interval_df[i, j] <- 0
      
    } else if(i < j) {
      
      #i <- 113; j <- 117
      randnum_url <- runif(1, 1, 20) %>% floor()
      url <- paste0(url_1, randnum_url, url_2, cities[i], url_3, cities[j], url_4)
      
      #guess_encoding(url) #ERROR:Timeout was reached
      cat(paste("抓取网页", url, "...\n"))
      ok <- FALSE
      counter <- 0
      while (ok == FALSE & counter <= 5) {
        counter <- counter + 1
        text_source <- tryCatch({
          # double safeguard: curl() with an explicit browser user-agent header
          read_html(curl(url, handle = curl::new_handle("useragent" = "Mozilla/5.0")), 
                    encoding = "UTF-8") %>% html_text()
        },
        error = function(e) {
          Sys.sleep(2)
          e
        }
        )
        if ("error" %in% class(text_source)) {
          cat("error happen")
        } else {
          ok <- TRUE
          cat("Done.")
        }
      }
      
      big_left <- str_locate_all(text_source, "\\{")
      big_right <- str_locate_all(text_source,"\\}")
      subs <- str_sub(text_source, big_left[[1]][1, 1], 
                      big_right[[1]][nrow(big_right[[1]]), 1])
      #validate(subs)
      #class(subs)
      
      sub_json <- fromJSON(subs)
      #str(sub_json)
      
      need_df <- sub_json[["data"]][["s2sBeanList"]][["extraBeanMap"]]
      interval <- sub_json[["data"]][["s2sBeanList"]][["extraBeanMap"]][["interval"]] 
      
      len_inte <- length(interval)
      if (len_inte > 0) {
        
        tf <- tempfile()
        write.table(interval, tf, quote=F,row.names = F)
        interval <- read.csv(tf, stringsAsFactors = F)
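        # The tempfile round trip coerces whatever fromJSON() returned for
        # "interval" (list or matrix) into a plain one-column data frame.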
        #class(interval)
        vec <- interval[, 1]
        # do not add spaces after d / H / M
        need_df[["duration"]] <- str_replace_all(vec, c("天" = "d", "小时" = "H", "分" = "M")) 

        idx_min_dura <- duration(need_df[["duration"]]) %>% as.numeric() %>% which.min()
        
        interval_df[i, j] <- need_df[idx_min_dura, "interval"]
        miles_df[i, j] <- need_df[idx_min_dura, "intervalMiles"]
        duration[i, j] <- duration(need_df[idx_min_dura, "duration"]) %>% as.numeric()
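        # The duration strings parse via lubridate::duration(), e.g. "1d2H30M"
        # -> 95400 seconds, so as.numeric() with which.min() above selects the
        # fastest itinerary.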
        
        ticketType <- str_split(need_df[idx_min_dura, "ticketType"], ",")
        
        seats <- sub_json[["data"]][["s2sBeanList"]][["seats"]]
        if (grepl("二等座", ticketType)) {
          
          minprice_df[i, j] <- seats[["二等座"]][idx_min_dura, "price"]
          tickettype[i, j] <- "二等座"
          
        } else if (grepl("无座", ticketType)) {
          
          minprice_df[i, j] <- seats[["无座"]][idx_min_dura, "price"]
          tickettype[i, j] <- "无座"
          
        } else if (grepl("硬座", ticketType)) {
          
          minprice_df[i, j] <- seats[["硬座"]][idx_min_dura, "price"]
          tickettype[i, j] <- "硬座"
          
        } else if (grepl("软座", ticketType)) {
          
          minprice_df[i, j] <- seats[["软座"]][idx_min_dura, "price"]
          tickettype[i, j] <- "软座"
          
        } else if (grepl("一等座", ticketType)) {
          
          minprice_df[i, j] <- seats[["一等座"]][idx_min_dura, "price"]
          tickettype[i, j] <- "一等座"
          
        } else if (grepl("商务座", ticketType)) {
          
          minprice_df[i, j] <- seats[["商务座"]][idx_min_dura, "price"]
          tickettype[i, j] <- "商务座"
          
        } else if (grepl("特等座", ticketType)) {
          
          minprice_df[i, j] <- seats[["特等座"]][idx_min_dura, "price"]
          tickettype[i, j] <- "特等座"
          
        } else if (grepl("硬卧", ticketType)) {
          
          minprice_df[i, j] <- seats[["硬卧"]][idx_min_dura, "price"]
          tickettype[i, j] <- "硬卧"
          
        } else if (grepl("软卧", ticketType)) {
          
          minprice_df[i, j] <- seats[["软卧"]][idx_min_dura, "price"]
          tickettype[i, j] <- "软卧"
          
        } else if (grepl("高级软卧", ticketType)) {
          
          minprice_df[i, j] <- seats[["高级软卧"]][idx_min_dura, "price"]
          tickettype[i, j] <- "高级软卧"
          
        } else {
          
          minprice_df[i, j] <- seats[["一人软包"]][idx_min_dura, "price"]
          tickettype[i, j] <- "一人软包"
          
        }
        
        cat(paste0(cities[i], "至", cities[j], "的行程数据抓取完毕!\n"))
        
        sleeptime <- runif(1, 0, 1) %>% round(., 1)
        Sys.sleep(sleeptime)
        
        cat(paste0("爬虫休息", sleeptime, "秒\n"))
        
      } else {
        
        cat(paste0(cities[i], "至", cities[j], "的直接行程不存在!\n"))
        next
      }
      
    } else {
      
      next
    }
    
  }
  
}
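
# The fetch-with-retry idiom above (tryCatch inside a bounded while loop)
# recurs in every section of this chapter. A minimal reusable sketch, assuming
# any zero-argument fetch function (hypothetical helper, not used below):
retry_fetch <- function(fetch, max_tries = 6, wait = 2) {
  for (k in seq_len(max_tries)) {
    res <- tryCatch(fetch(), error = function(e) { Sys.sleep(wait); e })
    if (!inherits(res, "error")) return(res)  # success: hand the result back
    cat("attempt", k, "failed, retrying...\n")
  }
  stop("all retries failed")
}
# e.g. text_source <- retry_fetch(function() read_html(url) %>% html_text())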


head(interval_df)
head(minprice_df)
head(miles_df)
head(tickettype)
head(duration)

all.need <- list(interval_df=interval_df,
                 minprice_df=minprice_df,
                 miles_df=miles_df,
                 tickettype=tickettype,
                 duration=duration)
writexl::write_xlsx(all.need, "c:/qunae.xlsx")

#######
t_interval <- t(interval_df)
t_minprice <- t(minprice_df)
t_miles <- t(miles_df)
t_tickettype <- t(tickettype)
t_duration <- t(duration)


for (i in seq(nrow(interval_df))) {
  for (j in seq(ncol(interval_df))) {
    if (i >= j) {
      interval_df[i, j] <- t_interval[i, j]
      minprice_df[i, j] <- t_minprice[i, j]
      miles_df[i, j] <- t_miles[i, j]
      tickettype[i, j] <- t_tickettype[i, j]
      duration[i, j] <- t_duration[i, j]
    }
  }
}
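
# A vectorized sketch of the same mirroring for one matrix (illustrative;
# as.matrix() is needed because lower.tri() indexing targets matrices):
# m <- as.matrix(interval_df); m[lower.tri(m)] <- t(m)[lower.tri(m)]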

head(interval_df)
head(minprice_df)
head(miles_df)
head(tickettype)
head(duration)

all.need <- list(interval_df=interval_df, 
                 minprice_df=minprice_df,
                 miles_df=miles_df,
                 tickettype=tickettype,
                 duration=duration)
writexl::write_xlsx(all.need, "c:/qunae_final.xlsx")
# 前程无忧数据抓取
require(rvest)
require(XML)
require(RCurl)

homepage <- "http://www.51job.com/shenzhen"
homepage <- getURLContent(homepage)
homepage <- read_html(homepage)


hyperlink_first <- homepage %>% 
  html_nodes(xpath = "//*[@class = 'e e1']/a") %>% 
  html_attr("href")

#hyperlink_first <- hyperlink_first[27:117]
#getURLContent(hyperlink_first[1])
#class(getURLContent(hyperlink_first[1]))

for (i in seq(length(hyperlink_first))) {
  
  #i <- 1
  cat("hyperlink_first_i = ", hyperlink_first[i])
  cat("\n")
  
  for (kk in 1:25) { # 25 pages per sub-category
    
    cat("Page", kk, "Begin!")
    cat("\n")
    
    #kk <- 1
    page_2 <- getURLContent(paste0(hyperlink_first[i],"p",kk), encoding = "UTF-8")
    
    needed <- data.frame(matrix(NA, 20, 6))
    
    colnames(needed) <- c("posit", "comp", "addre",
                          "salar", "comdetail", "posdetail")
    
    page_2 <- read_html(page_2, encoding = "UTF-8")
    
    # probe the first posting to check whether this page has any results
    # (renamed from `try` to avoid masking base::try)
    first_post <- page_2 %>%
      html_nodes(
        xpath = paste0('/html/body/div[3]/div[2]/div[1]/div/div[',1,']/p[1]/span[1]/a')) %>% 
      xml_text(trim = TRUE)
    
    if (length(first_post) != 0) {
      
      for (j in 1:20) {  # 20 job postings per page
        
        page_2_posit <- page_2 %>%
          html_nodes(
            xpath = paste0('/html/body/div[3]/div[2]/div[1]/div/div[',j,']/p[1]/span[1]/a')) %>% 
          xml_text(trim = TRUE)
        
        page_2_comp <- page_2 %>%
          html_nodes(
            xpath = paste0('/html/body/div[3]/div[2]/div[1]/div/div[',j,']/p[1]/a')) %>% 
          xml_text(trim = TRUE)
        
        page_2_addre <- page_2 %>%
          html_nodes(
            xpath = paste0('/html/body/div[3]/div[2]/div[1]/div/div[',j,']/p[1]/span[2]')) %>%
          xml_text(trim = TRUE)
        
        page_2_salar <- page_2 %>%
          html_nodes(
            xpath = paste0('/html/body/div[3]/div[2]/div[1]/div/div[',j,']/p[1]/span[3]')) %>%
          xml_text(trim = TRUE)
    
        page_2_comdetail <- page_2 %>%
          html_nodes(
            xpath = paste0('/html/body/div[3]/div[2]/div[1]/div/div[',j,']/p[2]')) %>%
          xml_text(trim = TRUE)
        
        page_2_posdetail <- page_2 %>%
          html_nodes(
            xpath = paste0('/html/body/div[3]/div[2]/div[1]/div/div[',j,']/p[3]')) %>%
          xml_text(trim = TRUE)
        
        # swap half-width commas for full-width ones so the separator stays unambiguous
        needed[j, 1] <- ifelse(length(page_2_posit) > 0,     
                               page_2_posit %>% gsub(",",",", .), "")
        needed[j, 2] <- ifelse(length(page_2_comp) > 0,      
                               page_2_comp %>% gsub(",",",", .), "")
        needed[j, 3] <- ifelse(length(page_2_addre) > 0,     
                               page_2_addre %>% gsub(",",",", .), "")
        needed[j, 4] <- ifelse(length(page_2_salar) > 0,     
                               page_2_salar %>% gsub(",",",", .), "")
        needed[j, 5] <- ifelse(length(page_2_comdetail) > 0, 
                               page_2_comdetail %>% gsub(",",",", .), "")
        needed[j, 6] <- ifelse(length(page_2_posdetail) > 0, 
                               page_2_posdetail %>% gsub(",",",", .), "")
        
      }
      
      write.table(needed, file = "c:/need_qcwu_3.csv", 
                  row.names = FALSE, col.names = FALSE,
                  sep = "###", append = TRUE)
      
      Sys.sleep(3)
      
    } else {next}
    
  }
  
  Sys.sleep(3)
}
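# Note on reading the file back: write.table() accepts the multi-character
# separator "###", but read.table() only takes a single-byte sep, so a
# read-back sketch is:
# rows <- readLines("c:/need_qcwu_3.csv")
# fields <- strsplit(rows, "###", fixed = TRUE)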
# Scraping Zhaopin (智联招聘)
# Set the request headers
myheader <- c(
  "User-Agent" = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6)",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)

## Load the required packages
require(XML)
require(rvest)
require(RCurl)
require(stringr)

getLinks <- function() { 
  links = character() 
  list(a = function(node, ...) { 
    links <<- c(links, xmlGetAttr(node, "href"))
    node 
  }, 
  links = function()links)
} 
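# getLinks() returns a pair of XML event handlers backed by a closure: each
# <a> element parsed triggers a(), which appends the node's href attribute to
# `links` in the enclosing environment via <<-, and h1$links() later returns
# the accumulated character vector.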
 
h1 <- getLinks()

url_industry <- "http://company.zhaopin.com/shenzhen/"
url_industry <- getURL(url_industry, httpheader = myheader, encoding = "UTF-8")

htmlTreeParse(url_industry,encoding = "UTF-8", asText = TRUE, handlers = h1)
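
# h1$links() now holds every href collected from the industry index page; the
# range 75:186 below is assumed (per the inline comment) to be the slice
# containing the industry links.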

for (industry in 75:186) {
  
  # industry <- 76

  indu_code <- h1$links()[industry]     #75-186
  url_comp_1 <- paste0("http://company.zhaopin.com",indu_code,"p")
  position_df_rb <- NULL  # accumulate every page of this industry before writing
  for (k in 1:7) { # roughly 7*30 = 210 companies per industry
    # k <- 1
    url_comp <- paste0(url_comp_1,k,"/")
    
    ok <- FALSE
    counter <- 0
    while (ok == FALSE & counter <= 5) {
      counter <- counter + 1
      uc_comp <- tryCatch({         
        htmlParse(url_comp,encoding = "UTF-8")
      },
      error = function(e) {
        Sys.sleep(2)
        e
      }
      )
      if ("error" %in% class(uc_comp)) {
        cat("error happen")
      } else {
        ok <- TRUE
        cat("uc_comp_Done.")
      }
    }

    #uc_comp <- htmlParse(url_comp,encoding = "UTF-8")
    xpath_comp <- "//*[@class = 'fleft checkjobs width280']/a"
    url.node_compa <- getNodeSet(uc_comp,xpath_comp)
    #length(url.node_compa)  #[1] 35
    #url.ineed_compa <- xmlGetAttr(url.node_compa[[1]],'href') 
    #[1] "http://company.zhaopin.com/CC672819926.htm"
    
    
    for (j in 1:length(url.node_compa)) {
      #j <- 3
       if (!is.null(url.node_compa[[j]])) {
         url.ineed_compa <- xmlGetAttr(url.node_compa[[j]],'href')
         ok <- FALSE
         counter <- 0
         while (ok == FALSE & counter <= 5) {
           counter <- counter + 1
           url.ineed_compa <- tryCatch({         
             getURL(url.ineed_compa,httpheader = myheader, encoding = "UTF-8")
           },
           error = function(e) {
             Sys.sleep(2)
             e
           }
           )
           if ("error" %in% class(url.ineed_compa)) {
             cat("error happen")
           } else {
             ok <- TRUE
             cat("url.ineed_compa_Done.")
           }
         }
         
         # url.ineed_compa <- getURL(url.ineed_compa,httpheader=myheader,encoding="UTF-8")
         
         url.ineed_compa2 <- htmlParse(url.ineed_compa,encoding = "UTF-8") 
         #guess_encoding(url.ineed_compa1)
         
         #url.ineed_compa2
         xpath <- "//*[@class = 'positionListContent1']/span/a"
         url.node <- getNodeSet(url.ineed_compa2,xpath)
         
         #length(url.node)#[1] 2
         #url.ineed_1 <- xmlGetAttr(url.node[[1]],'href')
         #url.ineed_2 <- xmlGetAttr(url.node[[2]],'href')
         
         
         position_df <- as.data.frame(matrix(NA,length(url.node),13))
         colnames(position_df) <- c("industry","company_name","position_name",
                                    "position_category","salary","location","date",
                                    "job_year","number_need","job_property","degree",
                                    "address","job_requirement_respon")
         
         for (i in 1:length(url.node)) {
           ### i <- 1
           
           if (!is.null(url.node[[i]])) {
             u <- xmlGetAttr(url.node[[i]], 'href')
             ok <- FALSE
             counter <- 0
             while (ok == FALSE & counter <= 5) {
               counter <- counter + 1
               cha_uc <- tryCatch({         
                 getURL(u, httpheader = myheader, encoding="UTF-8")
               },
               error = function(e) {
                 Sys.sleep(2)
                 e
               }
               )
               if ("error" %in% class(url.ineed_compa)) {
                 cat("error happen")
               } else {
                 ok <- TRUE
                 cat("cha_uc_Done.")
               }
             }
             
             #cha_uc <- getURL(u,httpheader=myheader,encoding="UTF-8")
             
             download.file(u, destfile = "u_scrapedpage.html", quiet = TRUE)
             
             uc <- read_html("u_scrapedpage.html")
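             # Two copies of the page are kept on purpose: cha_uc is the raw
             # HTML text used for the regex slice between the SWSStringCut
             # markers below, while uc is the parsed DOM used with CSS selectors.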
             # uc
             # class(uc)
             
             company_name <- uc %>% html_nodes(".fl a") %>% html_text()
             
             position_name <- uc %>% html_nodes("h1") %>% html_text()
             
             position_category <- uc %>% html_nodes("li:nth-child(8) strong") %>% html_text()
             #position_category[1]
             
             salary <- uc %>% html_nodes(".terminalpage-left li:nth-child(1) strong") %>% html_text()
             #salary[1]
             
             location <- uc %>% html_nodes(".terminal-ul li:nth-child(2) a") %>% html_text()
             
             date <- uc %>% html_nodes("#span4freshdate") %>% html_text()
             
             job_year <- uc %>% html_nodes(".terminalpage-left li:nth-child(5) strong") %>% html_text()
             #job_year[1]
             
             number_need <- uc %>% html_nodes("li:nth-child(7) strong") %>% html_text()
             #number_need[1]
             
             job_property <- uc %>% html_nodes(".terminalpage-left li:nth-child(4) strong") %>% html_text()
             #job_property[1]
             
             degree <- uc %>% html_nodes("li:nth-child(6) strong") %>% html_text()
             #degree[1]
             
             address <- uc %>% html_nodes(".tab-inner-cont h2") %>% html_text()
             address <- address %>% str_trim() %>% str_split("\r")
             #address[1]
             
             from <- str_locate_all(cha_uc,pattern = "<!-- SWSStringCutStart -->")
             to <- str_locate_all(cha_uc,pattern =  "<!-- SWSStringCutEnd -->")
             #from[[1]][1,"end"]
             
             job_requirement_respon <- str_sub(cha_uc,start = from[[1]][1,"end"] + 1,end = to[[1]][1,"start"] - 1)
             job_requirement_respon <- job_requirement_respon %>% str_trim(side = "both")
             # Note the difference between str_replace_all() and str_replace()
             job_requirement_respon <- str_replace_all(job_requirement_respon,"\n","")
             job_requirement_respon <- str_replace_all(job_requirement_respon,",","")  
             
             
             position_df[i,"industry"] <- indu_code
             position_df[i,"company_name"] <- ifelse(length(company_name) > 0,
                                                     company_name,"")
             position_df[i,"position_name"] <- ifelse(length(position_name) > 0,
                                                      position_name,"")
             position_df[i,"position_category"] <- ifelse(length(position_category[1]) > 0,
                                                          position_category[1],"")
             position_df[i,"salary"] <- ifelse(length(salary[1]) > 0,
                                               salary[1],"")
             position_df[i,"location"] <- ifelse(length(location) > 0,
                                                 location,"")
             position_df[i,"date"] <- ifelse(length(date) > 0,date,"")
             position_df[i,"job_year"] <- ifelse(length(job_year[1]) > 0,
                                                 job_year[1],"")
             position_df[i,"number_need"] <- ifelse(length(number_need[1]) > 0,
                                                    number_need[1],"")
             position_df[i,"job_property"] <- ifelse(length(job_property[1]) > 0,
                                                     job_property[1],"")
             position_df[i,"degree"] <- ifelse(length(degree[1]) > 0,
                                               degree[1],"")
             position_df[i,"address"] <- ifelse(length(address[[1]][1]) > 0,
                                                address[[1]][1],"")
             position_df[i,"job_requirement_respon"] <- ifelse(length(job_requirement_respon) > 0,
                                                               job_requirement_respon,"")
             
             print(paste0("i=",i,"完成"))
           }
         }
         
         position_df_rb <- rbind(position_df,position_df_rb)
       }
      
      print(paste0("j=",j,"完成"))
      Sys.sleep(1.8)
    }
    print(paste0("第",k,"页的公司完成"))
  }
  write.table(position_df_rb,"c:/position_df_rb.csv",
              sep = "||",row.names = FALSE,append = TRUE)
  write.table("\n","c:/position_df_rb.csv",sep = "||",
              row.names = FALSE,append = TRUE)
  print(paste0("行业",indu_code,"完成"))
}
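# Caveat: write.table(..., append = TRUE) re-emits the header row on every
# append (R warns about this); passing col.names = FALSE after the first
# write would keep the output file clean.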
# Scraping classical poetry from gushiwen.org (古诗词)
# https://so.gushiwen.org/authors/authorvsw_727e9dff8850A2.aspx
# https://so.gushiwen.org/authors/authorvsw_727e9dff8850A3.aspx

library(rvest)
library(stringr)
library(XML)
library(RCurl)  # getURLContent()
library(purrr)  # map()

url_1 <- 'https://so.gushiwen.org/authors/authorvsw_e3c4e8cf2646A'
url_2 <- '.aspx'
page <- 1:55

for (k in 1:length(page)) {
  # k <- 1
  url <- str_c(url_1, page[k] ,url_2)
  cat("抓取网页: ", url, "\n...", page[k], "...\n")
  ok <- FALSE
  counter <- 0
  while (ok == FALSE & counter <= 5) {
    counter <- counter + 1
    text_source <- tryCatch({
      urlContent <- getURLContent(url)
      read_html(urlContent)
    },
    error = function(e) {
      Sys.sleep(2)
      e
    })
    
    if ("error" %in% class(text_source)) {
      cat("错误发生\n")
    } else {
      ok <- TRUE
      cat("Done.\n")
    }
  }
  
  title <- text_source %>% html_nodes(
    xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "sons", " " ))]//b'
  ) %>% xml_text(trim = TRUE) %>% 
    str_replace_all(",", ",")   #csv文件分列
  
  dynasty <- text_source %>% html_nodes(
    xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "source", " " ))]//a[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]'
  ) %>% xml_text(trim = TRUE) %>% 
    str_replace_all(",", ",")
  
  who <- text_source %>% html_nodes(
    css = '.source span+ a'
  ) %>% xml_text(trim = TRUE) %>% 
    str_replace_all(",", ",")
  
  content <- text_source %>% html_nodes(
    css = '.contson'
  ) %>% xml_text(trim = TRUE) %>% 
    str_replace_all("\\n", "") %>% #去掉换行符
    str_replace_all(",", ",")
  
  
  dfNrows <- list(title, dynasty,
                  who, content) %>% 
    map(length) %>% 
    unlist() %>% 
    max()
  
  df <- matrix(NA, dfNrows, 4) %>% 
    as.data.frame()
  
  colnames(df) <- c("title", "dynasty", 'who', "content")
  
  for (i in 1:4) {
    #i <- 1
    id <- colnames(df)[i]
    idIsNotChar <- get(id, envir = .GlobalEnv)  # fetch the scraped vector by its column name
    if (length(idIsNotChar) != 0 &&
        length(idIsNotChar) == dfNrows) {
      df[, id] <- idIsNotChar
    } else if (length(idIsNotChar) != 0 &&
               length(idIsNotChar) < dfNrows) {
      df[, id] <- c(idIsNotChar, rep(NA, dfNrows - length(idIsNotChar)))
    } else {
      df[, id] <- NA
    }
  }
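  # The loop above pads any shorter vector with NA so that title / dynasty /
  # who / content align row-wise even when a page yields unequal counts.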
  
  write.table(df, file = "E:/kaggleData/LiuYuXi.csv",
              row.names = FALSE, col.names = FALSE,
              quote = FALSE, append = TRUE, sep = "##")
}