
Scraping nvd.nist.gov data while reading and writing Excel files

#!/usr/bin/env python
# encoding: utf-8
#@author: jack
import random
from time import sleep
import pandas as pd
from openpyxl import load_workbook
from urllib import request
from lxml import etree

wb = load_workbook("cve.xlsx")  # workbook to read; openpyxl is used because, per user feedback, xlrd handles newer Excel formats poorly
sheet = wb["Sheet1"]  # the default sheet name shown in Excel's tab bar
cve_list = []
for i in sheet["D"][1:25]:  # column D, from the row after the header through row 25
    cve_list.append(i.value)  # collect each CVE id into the list defined above

start_url = "https://nvd.nist.gov/vuln/detail/"  # the NVD detail pages are static HTML, which makes them easy to scrape
score_li = []
vector3_li = []
vector2_li = []
for url_code in cve_list:  # walk the collected CVE ids
    url = "{}{}".format(start_url, url_code)  # build the detail-page URL
    response = request.urlopen(url)
    result = response.read().decode()
    html = etree.HTML(result)
    v3BaseScore = html.xpath('//span[@data-testid="vuln-cvssv3-base-score"]/text()')  # locating with etree is easy
    Vector3 = html.xpath('//span[@data-testid="vuln-cvssv3-vector"]/text()')
    Vector2 = html.xpath('//span[@data-testid="vuln-cvssv2-vector"]/text()')
    score_li.append("".join(v3BaseScore))  # flatten each extracted list into a plain string
    vector3_li.append("".join(Vector3))
    vector2_li.append("".join(Vector2))

df1 = pd.DataFrame({"v3BaseScore": score_li})  # one pandas column per field
df2 = pd.DataFrame({"Vector3": vector3_li})
df3 = pd.DataFrame({"Vector2": vector2_li})
writer = pd.ExcelWriter("test1.xlsx")  # create the output Excel file
df1.to_excel(writer, sheet_name="Sheet1", startcol=1, index=False)  # write each column at its own offset
df2.to_excel(writer, sheet_name="Sheet1", startcol=2, index=False)
df3.to_excel(writer, sheet_name="Sheet1", startcol=3, index=False)
writer.save()
writer.close()
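
Since the pages are static, the XPath locators can be sanity-checked offline before crawling anything. A minimal sketch, with a made-up HTML fragment standing in for the real NVD markup:

from lxml import etree

# made-up fragment mimicking the data-testid spans on an NVD detail page
snippet = '''
<div>
  <span data-testid="vuln-cvssv3-base-score">9.8</span>
  <span data-testid="vuln-cvssv3-vector">CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H</span>
</div>
'''
html = etree.HTML(snippet)
print(html.xpath('//span[@data-testid="vuln-cvssv3-base-score"]/text()'))  # ['9.8']
print(html.xpath('//span[@data-testid="vuln-cvssv3-vector"]/text()'))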

I had to pick up pandas and openpyxl on the fly for this business requirement. The work broke down into:

1. Page analysis

2. Locator analysis

3. Data read/write analysis

4. The site falls over easily if you hammer it, so keep concurrency low; rotating User-Agent headers and proxies helps (see the fetch sketch below)
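
Here is a minimal sketch of point 4, assuming a user_agent_list like the one in the optimized script further down; polite_fetch, the retry count, and the delay are illustrative choices, not part of the original:

import random
import time
from urllib import request
from urllib.error import HTTPError, URLError

def polite_fetch(url, user_agent_list, retries=3, delay=1.0):
    """Fetch url with a random User-Agent, backing off between failed attempts."""
    for attempt in range(retries):
        req = request.Request(url, headers={"User-Agent": random.choice(user_agent_list)})
        try:
            return request.urlopen(req).read().decode()
        except (HTTPError, URLError):
            time.sleep(delay * (attempt + 1))  # linear backoff before retrying
    return ""  # give up quietly after the final retry

Swapping request.urlopen for a helper like this in the crawl loop keeps requests slow and varied. The full optimized script: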

import base64
import random
from multiprocessing.pool import ThreadPool
import time
import pandas as pd
from openpyxl import load_workbook
from urllib import request
from lxml import etree
from proxies import *  # local helper module referenced by the original post (contents not shown)

def task1():
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]


    header = {}
    header["User-Agent"] = random.choice(user_agent_list)  # pick a random UA so repeated runs look less uniform
    header["Host"] = "nvd.nist.gov"

    # read the CVE ids from the source workbook
    wb = load_workbook("cve.xlsx")
    sheet = wb["Sheet1"]
    cve_list = []
    for i in sheet["D"][1:]:  # column D, skipping the header cell
        cve_list.append(i.value)

    start_url = "https://nvd.nist.gov/vuln/detail/"
    score_li = []
    vector3_li = []
    vector3_href_li = []
    vector2_li = []
    vector2_href_li = []
    for url_code in cve_list:
        time.sleep(0.3)  # throttle: the site falls over easily under load
        url = "{}{}".format(start_url, url_code)
        res = request.Request(url, headers=header)
        response = request.urlopen(res)
        result = response.read().decode()

        # data-cleaning section: pull the fields out of the page
        html = etree.HTML(result)
        score_list = html.xpath('//span[@data-testid="vuln-cvssv3-base-score"]/text()')
        vector3_list = html.xpath('//span[@data-testid="vuln-cvssv3-vector"]/text()')
        vector2_list = html.xpath('//span[@data-testid="vuln-cvssv2-vector"]/text()')
        vector3_href_list = html.xpath('//span[@data-testid="vuln-cvssv3-vector"]//a/@href')
        vector2_href_list = html.xpath('//span[@data-testid="vuln-cvssv2-vector"]//a/@href')
        score_li.append("".join(score_list))
        vector3_li.append("".join(vector3_list))
        vector3_href_li.append("".join(vector3_href_list))
        vector2_li.append("".join(vector2_list))
        vector2_href_li.append("".join(vector2_href_list))

    # create one pandas DataFrame per field
    df1 = pd.DataFrame({"CVSSv3.0BaseScore": score_li})
    df2 = pd.DataFrame({"CVSS v3.0 Vector": vector3_li})
    df3 = pd.DataFrame({"CVSS v3.0 Vector link": vector3_href_li})
    df4 = pd.DataFrame({"CVSS v2.0 Vector": vector2_li})
    df5 = pd.DataFrame({"CVSS v2.0 Vector link": vector2_href_li})
    All = [df1, df2, df3, df4, df5]  # kept together for a single-shot concat (see the note below)

    # create a pandas Excel writer and place each frame at its own column offset
    writer = pd.ExcelWriter("basescore.xlsx")
    df1.to_excel(writer, sheet_name="Sheet2", startcol=2, index=False)
    df2.to_excel(writer, sheet_name="Sheet2", startcol=3, index=False)
    df3.to_excel(writer, sheet_name="Sheet2", startcol=4, index=False)
    df4.to_excel(writer, sheet_name="Sheet2", startcol=5, index=False)
    df5.to_excel(writer, sheet_name="Sheet2", startcol=6, index=False)
    writer.save()
    writer.close()

# Scrapy-style downloader-middleware method: it expects Scrapy request/spider
# objects, so it is kept here as a reference sketch rather than run directly
def process_request(self, request, spider):
    PROXIES = [
        {"ip_port": "61.160.233.8", "user_pass": ""},
        {"ip_port": "125.93.149.186", "user_pass": ""},
        {"ip_port": "58.38.86.181", "user_pass": ""},
        {"ip_port": "119.142.86.110", "user_pass": ""},
        {"ip_port": "124.161.16.89", "user_pass": ""},
        {"ip_port": "61.160.233.8", "user_pass": ""},
        {"ip_port": "101.94.131.237", "user_pass": ""},
        {"ip_port": "219.157.162.97", "user_pass": ""},
        {"ip_port": "61.152.89.18", "user_pass": ""},
        {"ip_port": "139.224.132.192", "user_pass": ""}
    ]
    proxy = random.choice(PROXIES)
    if proxy["user_pass"]:  # an empty string means an unauthenticated proxy
        request.meta["proxy"] = "http://%s" % proxy["ip_port"]
        encodebytes = base64.b64encode(proxy["user_pass"].encode(encoding="utf-8"))  # note: encodebytes is bytes, not str
        encoded_user_pass = str(encodebytes, "utf-8")
        request.headers["Proxy-Authorization"] = "Basic " + encoded_user_pass
    else:
        request.meta["proxy"] = "http://%s" % proxy["ip_port"]

if __name__ == "__main__":

    pool = ThreadPool()
    result = pool.apply_async(task1)  # process_request is middleware, not a standalone task, so only task1 is pooled
    pool.close()
    pool.join()
    result.get()  # re-raise any exception that occurred inside the worker
Optimized version.
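
One more possible tidy-up, assuming the five one-column DataFrames from task1: the All list is built but never used, and pd.concat could replace the five per-column to_excel calls with a single write. A sketch, not the original author's approach (the stand-in data and the basescore_concat.xlsx name are illustrative):

import pandas as pd

# stand-ins for the one-column frames collected in All inside task1
All = [pd.DataFrame({"CVSSv3.0BaseScore": ["9.8"]}),
       pd.DataFrame({"CVSS v3.0 Vector": ["CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H"]})]

merged = pd.concat(All, axis=1)  # lay the one-column frames side by side
with pd.ExcelWriter("basescore_concat.xlsx") as writer:  # the context manager saves on exit
    merged.to_excel(writer, sheet_name="Sheet2", startcol=2, index=False)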

 


Original post: https://www.cnblogs.com/jackzz/p/11160978.html
