首页 > 编程语言 > 详细

python3+selenium模拟浏览器采集数据

时间:2020-11-08 17:03:03      阅读:37      评论:0      收藏:0      [点我收藏+]

原来用的 go + chromedp 采集pdd商家后台订单信息,结果登录页面的时候说环境异常不给扫码

采集web端的时候验证登录莫名不能发送短信验证码,遇到安全验证弹不出对话框,应该是哪个地方没配置好,没头绪。换python+selenium试试

安装使用教程系列:https://blog.csdn.net/u011541946/category_6788788_1.html

 

于是python也遇到了同样的问题,想直接访问api,又被anti_content加密难倒。用browsermob捕获network,粗略地试了一下没成功,而且这个还需要java环境。

那换个思路吧。让selenium接管一个已经打开了的页面

先将chrome的路径放在path下,cmd运行

chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenum\AutomationProfile"

此时会打开一个chrome浏览器,试运行

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
 
# Attach Selenium to the Chrome instance that was started with
# --remote-debugging-port=9222 instead of launching a new browser.
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
# Raw string: the backslashes in a Windows path must not be read as escapes.
chrome_driver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(executable_path=chrome_driver, options=chrome_options)
# Printing the tab title confirms the running browser was taken over.
print(driver.title)

如果能打印出该tab下的网页title就成功接管了。

 

于是便开始了漫长的面向过程单线程之旅:

import time
import re
import xlwt
import math
import win32api,win32con
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait       #WebDriverWait注意大小写
from selenium.webdriver.common.by import By

def before():
    """Return a WebDriver attached to the already-running Chrome instance.

    The driver connects to the DevTools endpoint at 127.0.0.1:9222, so Chrome
    must have been started with ``--remote-debugging-port=9222`` beforehand.

    Returns:
        The ``webdriver.Chrome`` object bound to that browser.
    """
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    # Raw string: the backslashes in a Windows path must not be read as escapes.
    chrome_driver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    # `options=` replaces the deprecated `chrome_options=` keyword.
    return webdriver.Chrome(executable_path=chrome_driver, options=chrome_options)

# XPath of the element showing the recipient (name / phone) on the order page.
_RECIPIENT_XPATH = '//*[@id="mf-mms-orders-container"]/div/div/div[5]/div[2]/div[1]/div[1]/div[2]/div'
# XPath of the element showing the delivery address on the order page.
_ADDRESS_XPATH = '//*[@id="mf-mms-orders-container"]/div/div/div[5]/div[2]/div[1]/div[2]/div[2]/div'


def _reveal_field(driver, field_xpath, mask, dialog_title):
    """Click the "view" link inside a field and pause if it stays masked."""
    reveal_link = WebDriverWait(driver, 15).until(
        EC.visibility_of_element_located((By.XPATH, field_xpath + "/a/span"))
    )
    reveal_link.click()
    # Pause 1.5 s so the Ajax request triggered by the click can finish.
    time.sleep(1.5)
    if mask in driver.find_element(By.XPATH, field_xpath).text:
        # Still masked: a manual security check is pending. The modal dialog
        # blocks the script until the operator has passed it and clicks OK.
        win32api.MessageBox(0, "请先通过验证再关闭此对话框", dialog_title, win32con.MB_OK)


def get_msg(driver):
    """Read the recipient and address from the order-detail page.

    Clicks the "view phone number" and "view name and address" links, waits
    for a manual verification whenever a field is still masked with
    asterisks, then reads both fields again.

    Args:
        driver: WebDriver positioned on an order-detail page.

    Returns:
        The recipient text and the address text joined by a single space.
    """
    _reveal_field(driver, _RECIPIENT_XPATH, "*******", "收件人")
    _reveal_field(driver, _ADDRESS_XPATH, "****", "联系地址")
    # Re-read both fields: they may only have been unmasked after the
    # operator completed a verification above.
    name = driver.find_element(By.XPATH, _RECIPIENT_XPATH).text
    address = driver.find_element(By.XPATH, _ADDRESS_XPATH).text
    return name + " " + address


def choose_50(driver):
    """Switch the order list to 50 rows per page.

    Args:
        driver: WebDriver positioned on the order-list page.
    """
    wait = WebDriverWait(driver, 15)
    # Open the page-size dropdown in the pagination bar.
    wait.until(EC.visibility_of_element_located((
        By.XPATH,
        '//*[@id="mf-mms-orders-container"]/div/form/div[3]/div[2]/ul/li[2]/div/div/div/div/div/div/div/div[1]/input',
    ))).click()
    # Pick the fourth entry of the popup list, which is the "50" option.
    wait.until(EC.visibility_of_element_located((
        By.XPATH,
        '/html/body/div[6]/div/div/div/div/ul/li[4]/span',
    ))).click()

def get_count(driver, page):
    """Return how many times "next page" must be clicked.

    Args:
        driver: WebDriver positioned on the order-list page.
        page: Number of orders shown per page.

    Returns:
        The number of pages needed for all orders, minus one because the
        first page is already displayed.
    """
    total = int(get_total(driver))
    has_partial_page = total % page != 0
    pages = math.floor(total / page)
    if has_partial_page:
        pages += 1
    return pages - 1

def get_msg_by_orderid(order_id, driver):
    """Open one order's detail page in a new tab and scrape its recipient info.

    Args:
        order_id: Order number, appended to the detail URL as the ``sn`` value.
        driver: WebDriver on the order-list page; used only to open the tab.

    Returns:
        The string produced by ``get_msg`` for that order.
    """
    url = "https://mms.pinduoduo.com/orders/detail?type=4399&sn=" + order_id
    # Pass the URL as a script argument rather than splicing it into the
    # JavaScript source, so no quoting of order_id is needed.
    driver.execute_script("window.open(arguments[0]);", url)
    # NOTE(review): relies on a freshly attached driver landing on the tab
    # that was just opened — confirm.
    driver_new = before()
    try:
        return get_msg(driver_new)
    finally:
        # Close the detail tab even when scraping fails, so tabs don't pile up.
        driver_new.close()

def get_orders(driver):
    """Return every order number found on the current page.

    Order numbers are taken from the ``data-clipboard-text`` attribute values
    in the page's HTML source, in document order.

    Args:
        driver: Object exposing the page HTML as ``page_source``.

    Returns:
        A list of order-number strings (empty when none are found).
    """
    order_pattern = r"data-clipboard-text=\"(.*?)\" class="
    return re.findall(order_pattern, driver.page_source)

def get_total(driver):
    """Return the total number of orders shown in the pagination bar.

    Args:
        driver: WebDriver positioned on the order-list page.

    Returns:
        The first run of digits in the pagination summary, as a string.

    Raises:
        IndexError: If the summary text contains no digits.
    """
    summary = driver.find_element(
        By.XPATH,
        '//*[@id="mf-mms-orders-container"]/div/form/div[3]/div[2]/ul/li[1]',
    ).text
    return re.findall(r"\d+", summary)[0]
    
def excel_write(di, filename="orders.xlsx"):
    """Write the collected orders to a spreadsheet.

    Args:
        di: Mapping of order number to its shipping-info string.
        filename: Output path; defaults to the original hard-coded name.
    """
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet("订单")

    # Header row. Arguments are row, column, value.
    worksheet.write(0, 0, label="订单编号")
    worksheet.write(0, 1, label="发货信息")
    # One data row per order, starting below the header.
    for row, (order_id, msg) in enumerate(di.items(), start=1):
        worksheet.write(row, 0, label=order_id)
        worksheet.write(row, 1, label=msg)
    # NOTE(review): xlwt produces the legacy .xls format, so the default
    # ".xlsx" extension may not match the file's contents — confirm.
    workbook.save(filename)

def action_thispage(driver):
    """Scrape every order on the current page into the module-level ``di``.

    Args:
        driver: WebDriver positioned on the order-list page.
    """
    for order_id in get_orders(driver):
        di[order_id] = get_msg_by_orderid(order_id, driver)
    

def nextpage_click(driver):
    """Click the "next page" control of the order list.

    Args:
        driver: WebDriver positioned on the order-list page.
    """
    # Wait up to 15 s for the control to become visible, then click it.
    WebDriverWait(driver, 15).until(EC.visibility_of_element_located((
        By.XPATH,
        '//*[@id="mf-mms-orders-container"]/div/form/div[3]/div[2]/ul/li[6]/i',
    ))).click()
    
# Collected results across all visited pages: order number -> shipping info.
di = {}


def main():
    """Scrape every page of the order list and save the results."""
    driver = before()
    action_thispage(driver)
    # NOTE(review): must equal the page size the order list is actually
    # showing; choose_50() exists but is never called — confirm 5 is right.
    page = 5
    for _ in range(get_count(driver, page)):
        nextpage_click(driver)
        # Brief pause after paging, presumably to let the list reload.
        time.sleep(1)
        action_thispage(driver)
    excel_write(di)


if __name__ == "__main__":
    main()

可中途遇到的安全验证实在太频繁了,每个验证都需要人脑思考参与,暂时没法做到机器过验证。迫使用户使用官方工具...web采集放弃了

python3+selenium模拟浏览器采集数据

原文:https://www.cnblogs.com/longzhankunlun/p/13943957.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!