# _author: Jolly
# date: 2019/9/8

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree
import time
import csv


class Lagouspider(object):
    """Crawl python job postings on lagou.com and write (position, introduce) rows via a csv writer."""

    def __init__(self, writer_info):
        # writer_info: a csv.writer instance used to persist each scraped job row.
        self.writer_info = writer_info
        self.list_page_url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.driver_path = r'D:\xuexiruanjian\chromedriver\chromedriver.exe'
        self.driver = webdriver.Chrome(executable_path=self.driver_path)

    def run(self):
        """Fetch each list page in turn; stop when the "next page" control is disabled."""
        self.driver.get(self.list_page_url)  # open the first list page
        while True:
            list_page_source = self.driver.page_source  # HTML of the current list page
            self.parse_list_page(list_page_source)
            # NOTE: the trailing space inside class="pager_next " matches lagou's actual markup.
            span_tag = self.driver.find_element_by_xpath(
                '//div[@class="pager_container"]//span[@class="pager_next "]')
            if "pager_next pager_next_disabled" in span_tag.get_attribute('class'):
                break  # last page reached
            span_tag.click()  # jump to the next list page
            time.sleep(2)  # crude throttling to reduce the chance of anti-crawler blocking
            print('==========下一页========', end='\n\n')

    def parse_list_page(self, list_page_source):
        """Extract every job-detail URL on one list page and visit each of them."""
        htmlelement = etree.HTML(list_page_source)
        # All detail-page URLs on this list page.
        links = htmlelement.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, link):
        """Open one detail page in a new tab, scrape it, then switch back to the list tab."""
        self.driver.execute_script("window.open('%s')" % link)  # open link via JS in a new tab
        self.driver.switch_to.window(self.driver.window_handles[1])  # focus the new tab
        # FIX: wait for the job-description element BEFORE reading page_source —
        # the original read/parsed first and waited afterwards, so it could parse
        # a page that had not finished loading.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="job-detail"]'))
        )
        detail_page_source = self.driver.page_source
        self.parse_detail_page(detail_page_source)
        time.sleep(1)
        self.driver.close()  # close the detail tab
        self.driver.switch_to.window(self.driver.window_handles[0])  # back to the list tab

    def parse_detail_page(self, detail_page_source):
        """Pull the job title and description out of one detail page and write a CSV row."""
        htmlelement = etree.HTML(detail_page_source)
        title = "".join(htmlelement.xpath('//div[@class="ceil-left"]/span[@class="ceil-job"]/text()')).strip()
        content = "".join(htmlelement.xpath('//div[@class="job-detail"]//text()')).strip()
        self.writer_info.writerow((title, content))  # persist one row to the CSV file
        print(title, content, sep="\n")
        print('-----' * 10)


def main():
    # FIX: the original leaked the file handle (never closed, so the tail of the
    # buffer could be lost on an abnormal exit). Use a context manager, and pass
    # newline='' as the csv module requires for correct row terminators.
    with open('lagouinfo.csv', 'a', encoding='utf-8', newline='') as fp:
        writer_info = csv.writer(fp)
        writer_info.writerow(('position', 'introduce'))  # CSV header row
        spider = Lagouspider(writer_info)
        spider.run()


if __name__ == '__main__':
    main()
# Source: https://www.cnblogs.com/Jolly-hu/p/12227332.html