首页 > 其他 > 详细

LouGou小爬虫

时间:2020-01-22 00:05:51      阅读:136      评论:0      收藏:0      [点我收藏+]
技术分享图片
 1 # _author:   Jolly
 2 # date:  2019/9/8
 3 
 4 from selenium import webdriver
 5 from selenium.webdriver.support.ui import WebDriverWait
 6 from selenium.webdriver.support import expected_conditions as EC
 7 from selenium.webdriver.common.by import By
 8 from lxml import etree
 9 import time
10 import csv
11 
class Lagouspider(object):
    """Scrape python job postings from lagou.com with Selenium.

    Walks every paginated list page, opens each job's detail page in a
    new browser tab, extracts the title and description with lxml, and
    writes one (title, content) row per job through ``writer_info``.
    """

    def __init__(self, writer_info):
        # writer_info: a csv.writer-like object; each job is persisted
        # via writer_info.writerow((title, content)).
        self.writer_info = writer_info
        self.list_page_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        self.driver_path = r"D:\xuexiruanjian\chromedriver\chromedriver.exe"
        self.driver = webdriver.Chrome(executable_path=self.driver_path)

    def run(self):
        """Fetch the first list page, then loop: parse page, click "next"."""
        self.driver.get(self.list_page_url)
        while True:
            list_page_source = self.driver.page_source
            self.parse_list_page(list_page_source)
            # The "next page" control. Its class is "pager_next " on normal
            # pages and "pager_next pager_next_disabled" on the last page,
            # so an exact @class match would fail there — use contains().
            span_tag = self.driver.find_element_by_xpath(
                '//div[@class="pager_container"]//span[contains(@class, "pager_next")]'
            )
            if "pager_next_disabled" in span_tag.get_attribute("class"):
                break  # last page reached
            else:
                span_tag.click()  # advance to the next list page
            time.sleep(2)  # throttle between pages to avoid anti-crawler blocks
            print("==========下一页========", end="\n\n")

    def parse_list_page(self, list_page_source):
        """Extract every detail-page URL from a list page and visit each."""
        htmlelement = etree.HTML(list_page_source)
        # All job links on the current list page.
        links = htmlelement.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)  # polite delay between detail-page requests

    def request_detail_page(self, link):
        """Open *link* in a new tab, parse it, then return to the list tab."""
        self.driver.execute_script("window.open('%s')" % link)
        # Focus the newly opened tab (it is appended after the list tab).
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Wait for the job description to render BEFORE reading page_source;
        # reading first could hand the parser a half-loaded page.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="job-detail"]'))
        )
        detail_page_source = self.driver.page_source
        self.parse_detail_page(detail_page_source)
        time.sleep(1)
        self.driver.close()  # close the detail tab
        # Switch back to the original list-page tab.
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, detail_page_source):
        """Pull the job title and full description; write them as a CSV row."""
        htmlelement = etree.HTML(detail_page_source)
        title = "".join(
            htmlelement.xpath('//div[@class="ceil-left"]/span[@class="ceil-job"]/text()')
        ).strip()
        content = "".join(
            htmlelement.xpath('//div[@class="job-detail"]//text()')
        ).strip()
        self.writer_info.writerow((title, content))
        print(title, content, sep="\n")
        print("-----" * 10)
63 
def main():
    """Create the output CSV and run the spider until all pages are done."""
    # newline="" is required by the csv module to avoid blank rows on
    # Windows; the with-block guarantees the file is flushed and closed
    # even if the spider raises.
    with open("lagouinfo.csv", "a", encoding="utf-8", newline="") as fp:
        writer_info = csv.writer(fp)
        # CSV header row.
        writer_info.writerow(("position", "introduce"))
        spider = Lagouspider(writer_info)
        spider.run()


if __name__ == "__main__":
    main()
View Code

LouGou小爬虫

原文:https://www.cnblogs.com/Jolly-hu/p/12227332.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!