What is a web crawler?
- Every site publishes a crawling policy (e.g. https://www.baidu.com/robots.txt), which spells out which paths crawlers may and may not visit. (A small robots.txt check is sketched right after this list.)
- Technically, anything you can see in a browser can be crawled.
- Legally, much of this is a gray area: crawling often skirts the edge of what is allowed.
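A minimal sketch, not from the original notes, of checking a site's robots.txt with the standard library before crawling:

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://www.baidu.com/robots.txt')
rp.read()  # download and parse robots.txt

# can_fetch() answers whether the given user agent may crawl the given URL
print(rp.can_fetch('*', 'https://www.baidu.com/s'))
print(rp.can_fetch('Googlebot', 'https://www.baidu.com/baidu'))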
1. The requests module (for simulating HTTP requests)
- Installation: pip3 install requests. (urllib and urllib2 come built in with Python; the requests module is a wrapper built on top of them.)

# **** Basic usage ****
# import the module
import requests

# send a GET request; it returns a response object
res = requests.get('https://www.baidu.com')

# the body of the response
print(res.text)

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

# the status code of the response
print(res.status_code)
# **** Carrying query parameters: Chinese must be URL-encoded ****
import requests
from urllib.parse import urlencode

key = input('Enter the search keyword: ')
# Chinese or other special characters carried as parameters must be encoded
key_search = urlencode({'wd': key})
print(key_search)

url = 'https://www.baidu.com/s?%s' % key_search

# anti-crawling countermeasure #1: send a User-Agent header
res = requests.get(url,
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   })

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
# Encoding by hand every time is tedious; use the params argument of requests instead
import requests

key = input('Enter the search keyword: ')

# anti-crawling countermeasure #1: send a User-Agent header
res = requests.get('https://www.baidu.com/s',
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   },
                   # parameters carried in the GET query string
                   params={'wd': key, 'pn': 70})

print(res.text)

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)


# Cookies
import requests

cookies = {
    'user_session': '2OjTrCoCXigz5gB7trCsYfMBl0jQ-abqjXdCcas9UqwVmD7y',
}

# GitHub does not restrict request headers much, so no custom User-Agent is needed; other sites may require one
response = requests.get('https://github.com/settings/emails',
                        cookies=cookies)

print('lich_qiu@163.com' in response.text)  # True
- GET parameters
    params = dict (parameters passed in the query string)
    headers = dict
        - User-Agent: the client type
        - Referer: the previous address the request came from (used for image hotlink protection)
        - Host:
        - Cookie: a string
    cookies = {'user_session': 'xxx'}
        Cookies are special and used constantly; they normally belong in the request headers, but requests handles them through a separate argument.
- POST parameters
    params
    headers
    cookies
    data: the request body, encoded as urlencoded form data by default
    json: pass a dict and the request is sent with 'content-type': 'application/json'
    allow_redirects=False: whether redirects are followed; defaults to True and is rarely changed.
    (A short data-vs-json sketch follows the GitHub login example below.)

# Step 1: send a GET request to https://github.com/login
import requests
import re

res_login = requests.get('https://github.com/login',
                         headers={
                             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                         })
print(res_login.text)

# the returned page contains an authenticity_token; pull it out
# re.S treats the whole string as one line
authenticity_token = re.findall(r'name="authenticity_token".*?value="(.*?)"', res_login.text, re.S)[0]
print(authenticity_token)

# grab the not-yet-authenticated cookie
login_cookie = res_login.cookies.get_dict()
print(login_cookie)

# Step 2: POST the username and password to https://github.com/session
data = {
    'commit': 'Sign in',
    'utf8': '?',
    'authenticity_token': authenticity_token,
    'login': 'lich_qiu@163.com',
    'password': 'zhang319!',
    'webauthn-support': 'supported'
}

res = requests.post(url='https://github.com/session',
                    # the request body
                    data=data,
                    # carry the cookie that has not been authenticated yet
                    cookies=login_cookie)

# on a successful login the server returns a cookie; keep it and send it with later requests
# res.cookies.get_dict() converts the returned cookie into a dict
res_cookie = res.cookies.get_dict()
print(res_cookie)

# Step 3: request https://github.com/settings/emails and check whether lich_qiu@163.com appears in the response
# GitHub does not restrict request headers much, so no custom User-Agent is strictly needed; other sites may require one
response = requests.get('https://github.com/settings/emails',
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                            'Referer': 'https://github.com/settings/profile'
                        },
                        cookies=res_cookie)

print('lich_qiu@163.com' in response.text)  # True
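A minimal sketch, not from the original notes, of the difference between the data and json arguments, using httpbin.org as a test endpoint:

import requests

# data= sends an application/x-www-form-urlencoded body
r1 = requests.post('http://httpbin.org/post', data={'name': 'lich'})
print(r1.json()['form'])  # {'name': 'lich'}

# json= serializes the dict and sets the content-type to application/json
r2 = requests.post('http://httpbin.org/post', json={'name': 'lich'})
print(r2.json()['json'])  # {'name': 'lich'}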
# Encoding issues
import requests

response = requests.get('http://www.autohome.com/news')

# the encoding the current page appears to use
print(response.apparent_encoding)

# switch the decoding to gbk
response.encoding = 'gbk'
print(response.text)
# Downloading an image
# for a small file you can fetch the whole content in one go
import requests

res = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1556732811646&di=2bd8396b35047f33fbcd6b023f5787b7&imgtype=0&src=http%3A%2F%2Fs15.sinaimg.cn%2Fmw690%2F0066UWNtgy6Viz3mEBoce%26690')

with open('a.jpg', 'wb') as f:
    f.write(res.content)
# Downloading a video
# for a large file, loop over the body with iter_content()
import requests

# stream=True keeps the body from being loaded into memory all at once
res = requests.get('http://static.yximgs.com/s1/videos/www_main-059ce9beee.mp4', stream=True)
with open('a.mp4', 'wb') as f:
    for chunk in res.iter_content():
        f.write(chunk)
# Parsing JSON
import requests
import json

response = requests.get('http://httpbin.org/get')

res1 = json.loads(response.text)  # the long way round
res2 = response.json()            # get the JSON data directly

print(res1 == res2)  # True, the results are identical
- The response object (a redirect sketch follows below)
    print(response.text)                --- body as text
    print(response.content)             --- body as bytes
    print(response.status_code)         --- status code
    print(response.headers)             --- response headers
    print(response.cookies)             --- cookies returned by the server
    print(response.cookies.get_dict())  --- returned cookies as a dict
    print(response.cookies.items())     --- like dict.items()
    print(response.url)                 --- the final URL (after any redirects)
    print(response.history)             --- list of the intermediate redirect responses
    print(response.encoding)            --- encoding used to decode the body
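A small sketch, not from the original notes, showing what url and history hold after a redirect, again using httpbin.org:

import requests

# /redirect/1 answers with a redirect that points at /get
response = requests.get('http://httpbin.org/redirect/1')

print(response.url)          # the final URL after the redirect was followed
print(response.history)      # the intermediate redirect response(s)
print(response.status_code)  # 200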
2. Advanced usage of the requests module
# 1 SSL cert verification
# verify=False skips certificate verification
import requests

# carrying a client certificate
response = requests.get('https://www.12306.cn',
                        cert=('/path/server.crt', '/path/key'))
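A minimal sketch, not from the original notes, of the verify=False case mentioned above; it skips certificate checks, with the warning that urllib3 emits for unverified HTTPS requests silenced:

import requests
import urllib3

# silence the InsecureRequestWarning raised for unverified HTTPS requests
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# verify=False: do not validate the server's certificate
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)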
# 2 Using a proxy
# HTTP proxy
import requests

proxies = {
    # a proxy with a username and password; user:password goes before the @
    'http': 'http://lich:123@112.85.151.216:9999',
    # or a plain proxy without auth: 'http': 'http://223.241.116.173:8010'
    # (a dict can only hold one entry per scheme, so pick one)
    'https': 'https://localhost:8888',
}

response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)
# SOCKS proxy (needs the SOCKS extra: pip3 install requests[socks])
import requests

proxies = {
    # a proxy with a username and password; user:password goes before the @
    'http': 'socks5://lich:123@112.85.151.216:9999',
    # 'http': 'socks5://223.241.116.173:8010',
    # 'https': 'socks5://localhost:8888',
}

response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)
# Timeout setting
import requests

response = requests.get('https://www.12306.cn', timeout=0.0001)
# Uploading a file
import requests

files = {
    'file': open('a.jpg', 'rb')
}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)
3. Crawler project examples
# Single-threaded crawl
import requests
import re
import os


# generic helper: fetch the page content for a URL
def get_page(url):
    ret = requests.get(url)
    if ret.status_code == 200:
        return ret.text


def parse_res(text):
    # <a href="video_1549859" class="vervideo-lilink actplay">
    urls = re.findall(r'class="categoryem".*?href="(.*?)" ', text, re.S)
    print(urls)
    for url in urls:
        print(url)
        yield 'https://www.pearvideo.com/' + url


def parse_detail(text):
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    # print('actual address of the video file:', movie_url)
    return movie_url


def base_dir():
    base = os.path.dirname(os.path.abspath(__file__))
    return base


def download_movie(url):
    import time
    movie_content = requests.get(url)
    file_name = str(time.time()) + '.mp4'
    with open('%s/download/%s' % (base_dir(), file_name), 'wb') as f:
        f.write(movie_content.content)


if __name__ == '__main__':
    res = get_page('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=6&start=1')
    # res is the content of the listing page
    urls = parse_res(res)
    for url in urls:
        try:
            res_detail = get_page(url)
            movie_url = parse_detail(res_detail)
            download_movie(movie_url)
        except Exception as e:
            print(e)
# Multi-threaded crawl
import requests
import re
import os
from concurrent.futures import ThreadPoolExecutor

# create a pool of 60 threads up front
pool = ThreadPoolExecutor(60)


# generic helper: fetch the page content for a URL
def get_page(url):
    ret = requests.get(url)
    if ret.status_code == 200:
        return ret.text


def parse_res(text):
    # text is a Future here; .result() gets the return value of the previous task
    text = text.result()
    urls = re.findall(r'class="categoryem".*?href="(.*?)" ', text, re.S)
    print(urls)
    for url in urls:
        print(url)
        # yield 'https://www.pearvideo.com/' + url
        pool.submit(get_page, 'https://www.pearvideo.com/' + url).add_done_callback(parse_detail)


def parse_detail(text):
    text = text.result()
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    # print('actual address of the video file:', movie_url)
    pool.submit(download_movie, movie_url)


def base_dir():
    base = os.path.dirname(os.path.abspath(__file__))
    base = os.path.join(base, 'download')
    return base


def download_movie(url):
    import time
    movie_content = requests.get(url)
    file_name = str(time.time()) + '.mp4'
    file = os.path.join(base_dir(), file_name)
    if movie_content.status_code == 200:
        with open(file, 'wb') as f:
            f.write(movie_content.content)


if __name__ == '__main__':
    for i in range(3):
        url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=6&start=%s' % (i * 12 + 1)
        # submit the page-fetching job to the pool
        # add_done_callback() runs the given function once the job has finished
        pool.submit(get_page, url).add_done_callback(parse_res)
# Using the BeautifulSoup module
import requests
from bs4 import BeautifulSoup
import time
import os

# https://www.autohome.com.cn/news/2/#liststart
for i in range(1, 10):
    url = 'https://www.autohome.com.cn/news/%s/#liststart' % i
    ret = requests.get(url)
    # print(ret.text)

    # soup = BeautifulSoup(ret.text, 'lxml')
    soup = BeautifulSoup(ret.text, 'html.parser')
    ul = soup.find(name='ul', attrs={'class': 'article'})
    li_list = ul.find_all(name='li')
    for li in li_list:
        try:
            news_url = 'https:' + li.find(name='a').get('href')   # read an attribute: URL of the news item
            news_title = li.find(name='h3').text                   # text of the h3 tag: title
            news_desc = li.find(name='p').text                     # summary of the news item
            news_img = 'https:' + li.find(name='img').get('src')   # image of the news item
            print(
                '''
                Title:    %s
                Summary:  %s
                URL:      %s
                Image:    %s
                ''' % (news_title, news_desc, news_url, news_img)
            )

            # download the news image
            response = requests.get(news_img)
            time_name = str(time.time()) + '.jpg'
            base_path = os.path.dirname(os.path.abspath(__file__))
            download_path = os.path.join(base_path, 'download')
            file_name = os.path.join(download_path, time_name)
            with open(file_name, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            print(e)
# Logging in to Chouti directly: the login succeeds, but the upvote at the end does not
import requests

ret = requests.get('https://dig.chouti.com',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   })
# print(ret.status_code)
print(ret.text)

# simulated login: status code 9999 means the login succeeded, but upvoting still fails
ret = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                    },
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    })
print(ret.text)

# take the cookie returned after the login
cookie = ret.cookies.get_dict()

# upvote an article: POST to https://dig.chouti.com/link/vote?linksId=25944651
res = requests.post('https://dig.chouti.com/link/vote?linksId=25944651',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=cookie)
print(res.text)
# Step 1: open the Chouti home page first (the vote requests later carry this first-visit cookie, ret_cookie)
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://dig.chouti.com/',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                   })
ret_cookie = ret.cookies.get_dict()

# Step 2: simulate the login
res = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=ret_cookie,
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    })
print(res.text)
res_cookie = res.cookies.get_dict()

# # Step 3 (hard-coded article id; superseded by the loop below)
# # upvote one article: POST to https://dig.chouti.com/link/vote?linksId=25944651
# response = requests.post('https://dig.chouti.com/link/vote?linksId=25944651',
#                          headers={
#                              'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
#                              'referer': 'https://dig.chouti.com/'
#                          },
#                          cookies=ret_cookie)
# print(response.text)

# Step 3: after logging in, walk the listing pages and collect the vote URLs
post_url_list = []
for i in range(5, 10):
    response = requests.get('https://dig.chouti.com/all/hot/recent/%s' % i,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            })
    soup = BeautifulSoup(response.text, 'html.parser')
    div_all = soup.find(name='div', attrs={'class': 'content-list', 'id': 'content-list'})
    div_list = div_all.find_all(name='div', attrs={'class': 'news-pic'})
    for div in div_list:
        try:
            news_id = div.find(name='img').get('lang')
            # 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url = 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url_list.append(post_url)
        except Exception as e:
            print('something went wrong here:', e)
# print(post_url_list)

# Step 4: loop over the collected articles and upvote each one
# upvoting = POST to https://dig.chouti.com/link/vote?linksId=<news id>
for url in post_url_list:
    up_news = requests.post(url,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            },
                            cookies=ret_cookie)
    print(up_news.text)
# Step 1: open the Chouti home page first
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://dig.chouti.com/',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                   })
ret_cookie = ret.cookies.get_dict()

# Step 2: simulate the login
res = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=ret_cookie,
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    })
print(res.text)
res_cookie = res.cookies.get_dict()

# Step 3: after logging in, walk the listing pages, collect the news ids and build the vote URLs
post_url_list = []
news_id_list = []
for i in range(5, 10):
    response = requests.get('https://dig.chouti.com/all/hot/recent/%s' % i,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            })
    soup = BeautifulSoup(response.text, 'html.parser')
    div_all = soup.find(name='div', attrs={'class': 'content-list', 'id': 'content-list'})
    div_list = div_all.find_all(name='div', attrs={'class': 'news-pic'})
    for div in div_list:
        try:
            news_id = div.find(name='img').get('lang')
            news_id_list.append(news_id)
            # 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url = 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url_list.append(post_url)
        except Exception as e:
            print('something went wrong here:', e)

# Step 4: loop over the articles and cancel the upvote on each one
# cancelling = POST to https://dig.chouti.com/vote/cancel/vote.do with form data {linksId: <news id>}
url = 'https://dig.chouti.com/vote/cancel/vote.do'
for news_id in news_id_list:
    up_news = requests.post(url,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            },
                            cookies=ret_cookie,
                            data={
                                'linksId': news_id
                            })
    print(up_news.text)
4. The bs4 (BeautifulSoup) module
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# 1. Basic usage
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
# soup = BeautifulSoup(open('a.html'), 'lxml')

print(soup.p)                 # if several identical tags exist, only the first one is returned
print(soup.p.b.text)          # The Dormouse's story
print(soup.p.b.get('class'))  # ['boldest']
print(soup.a)                 # if several identical tags exist, only the first one is returned

# 2. Tag name
print(soup.p.name)

# 3. Tag attributes
print(soup.p.attrs)

# 4. Tag contents
print(soup.p.string)    # returns the text only when p has a single text child, otherwise None
print(soup.p.strings)   # a generator over all the text under p
print(soup.p.text)      # all the text under p
for line in soup.stripped_strings:  # all text with the whitespace stripped
    print(line)

'''
If a tag has more than one child, .string cannot decide which child's text to return, so it
returns None; with a single child it returns that child's text. For a structure like the one
below, soup.p.string is None, but soup.p.strings still finds all the text.
<p id='list-1'>
    哈哈哈哈
    <a class='sss'>
        <span>
            <h1>aaaa</h1>
        </span>
    </a>
    <b>bbbbb</b>
</p>
'''

# 5. Nested selection
print(soup.head.title.string)
print(soup.body.a.string)

# 6. Children and descendants
print(soup.p.contents)   # all direct children of p as a list
print(soup.p.children)   # an iterator over the direct children of p
for i, child in enumerate(soup.p.children):
    print(i, child)

print(soup.p.descendants)  # descendants: every tag nested anywhere under p
for i, child in enumerate(soup.p.descendants):
    print(i, child)

# 7. Parents and ancestors
print(soup.a.parent)    # the parent of the a tag
print(soup.a.parents)   # all ancestors of the a tag: the parent, the parent's parent, ...

# 8. Siblings
print('=====>')
print(soup.a.next_sibling)          # the next sibling
print(soup.a.previous_sibling)      # the previous sibling
print(list(soup.a.next_siblings))   # all following siblings => generator
print(soup.a.previous_siblings)     # all preceding siblings => generator
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Five kinds of filters: a string, a regular expression, a list, True, and a function
# soup.find()  # find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
# name: tag name, attrs: attributes, text: text content, recursive=False disables recursive search (default True), **kwargs

from bs4 import BeautifulSoup

# 1. Exact string match
# soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find(name='body')                    # condition 1: tag name
# ret = soup.find(attrs={'class': 'title'})       # condition 2: attribute
# ret = soup.find(text="The Dormouse's story")    # condition 3: text
# print(ret)
# print(type(ret))

# 2. Regular-expression match
# import re
# soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find(name=re.compile('^p'))
# ret = soup.find(attrs={'class': re.compile('^s')})
# ret = soup.find(name='a', text=re.compile('^L'))
# print(ret)

# 3. List match
soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find_all(name=['a', 'b'])
# ret = soup.find_all(attrs={'class': ['title', 'sister']})
ret = soup.find_all(text=['Elsie', 'Lacie'])
print(ret)
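The remaining two filters from the list above (True and a function) are not shown in the original snippet; a minimal sketch of both, reusing the same soup:

# 4. True matches any value: find every tag that has an id attribute
print(soup.find_all(id=True))

# 5. A function filter: it receives each tag and returns True for the ones to keep
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

print(soup.find_all(has_class_but_no_id))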
5. The selenium module
# The most basic usage
from selenium import webdriver
import time

# webdriver.Chrome() returns an object that behaves like my browser
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
print(browser.page_source)

time.sleep(2)

# close the browser (always do this)
browser.close()
#### All selector methods
# 1. find_element_by_id                  find by id
# 2. find_element_by_link_text           find by exact link text
# 3. find_element_by_partial_link_text   find by partial link text
# 4. find_element_by_tag_name            find by tag name
# 5. find_element_by_class_name          find by class name
# 6. find_element_by_name                find by the name attribute
# 7. find_element_by_css_selector        find by CSS selector
# 8. find_element_by_xpath               find by XPath
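A minimal sketch, not from the original notes, of the CSS-selector and XPath variants side by side; the id kw is Baidu's search box, the same element used in the examples below:

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

# the same search box located two more ways
by_css = browser.find_element_by_css_selector('#kw')
by_xpath = browser.find_element_by_xpath('//input[@id="kw"]')
print(by_css.tag_name, by_xpath.tag_name)  # input input

browser.close()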
# Simple use 1: open Baidu and type a keyword into the search box
from selenium import webdriver
import time

try:
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')
    time.sleep(2)

    search_input = browser.find_element_by_id('kw')
    key = input('Enter the search keyword: ')
    search_input.send_keys(key)
    time.sleep(5)
except Exception as e:
    print(e)
finally:
    browser.close()
# Simple use 2: open Baidu and log in
from selenium import webdriver
import time

try:
    browser = webdriver.Chrome()
    # implicit wait: when looking up an element, wait up to 3 seconds for it to appear
    browser.implicitly_wait(3)
    browser.get('https://www.baidu.com')
    time.sleep(2)

    login_btn = browser.find_element_by_link_text('登录')
    login_btn.click()
    user_login = browser.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    username_input = browser.find_element_by_id('TANGRAM__PSP_10__userName')
    username_input.send_keys('13681878977')
    password_input = browser.find_element_by_id('TANGRAM__PSP_10__password')
    password_input.send_keys('zhang319!')
    submit_btn = browser.find_element_by_id('TANGRAM__PSP_10__submit')
    submit_btn.click()
    time.sleep(5)

    search_input = browser.find_element_by_id('kw')
    search_input.send_keys('名侦探柯南')
    time.sleep(10)
except Exception as e:
    print(e)
finally:
    browser.close()
# Simple use 3: crawl product listings from jd.com
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time


def get_goods(browser):
    li_list = browser.find_elements_by_class_name('gl-item')
    for li in li_list:
        goods_price = li.find_element_by_css_selector('.p-price i').text
        goods_comment = li.find_element_by_css_selector('.p-commit strong a').text
        goods_name = li.find_element_by_css_selector('.p-name-type-2 a').get_attribute('title')
        goods_url = li.find_element_by_css_selector('.p-name-type-2 a').get_attribute('href')
        goods_img = li.find_element_by_css_selector('.p-img a img').get_attribute('src')
        if not goods_img:
            # lazily loaded images keep their address in data-lazy-img instead of src
            goods_img = 'https:' + li.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')

        print(
            '''
            Name:      %s
            Price:     %s
            Comments:  %s
            URL:       %s
            Image:     %s
            ''' % (goods_name, goods_price, goods_comment, goods_url, goods_img)
        )

    # go to the next page and crawl it recursively
    next_page = browser.find_element_by_partial_link_text('下一页')
    time.sleep(2)
    next_page.click()
    get_goods(browser)


def spider():
    try:
        browser = webdriver.Chrome()
        browser.implicitly_wait(3)
        browser.get('https://www.jd.com')

        search_input = browser.find_element_by_id('key')
        search_input.send_keys('手机')
        search_input.send_keys(Keys.ENTER)
        time.sleep(5)

        # pull the product information out of the page
        get_goods(browser)
    except Exception as e:
        print(e)
    finally:
        browser.close()


if __name__ == '__main__':
    spider()
# Simple use 4: simulate the browser's back and forward buttons
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')

browser.back()
time.sleep(10)
browser.forward()
browser.close()


# Executing JavaScript
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('alert(1234)')
# while the alert raised by the js code is open, .close() does not take effect
browser.close()


# Tab management: tabs can be switched with js (window.open), with keyboard shortcuts such as
# ctrl+t, etc.; the js way is the most portable
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')

print(browser.window_handles)  # all open tabs
browser.switch_to_window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)
browser.switch_to_window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()


# Controlling mouse drag
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By                 # lookup strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys             # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait   # wait for elements to appear on the page
import time

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
wait = WebDriverWait(driver, 3)
# driver.implicitly_wait(3)  # or use an implicit wait

try:
    driver.switch_to.frame('iframeResult')  # switch into the iframeResult frame
    sourse = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')

    # approach 1: queue the actions on one chain and run them serially
    # actions = ActionChains(driver)          # get an action-chain object
    # actions.drag_and_drop(sourse, target)   # put the action on the chain, ready to run serially
    # actions.perform()

    # approach 2: separate chains, moving by a small offset each time
    ActionChains(driver).click_and_hold(sourse).perform()
    distance = target.location['x'] - sourse.location['x']

    track = 0
    while track < distance:
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        track += 2

    ActionChains(driver).release().perform()

    time.sleep(10)
finally:
    driver.close()
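The drag-and-drop snippet above imports WebDriverWait and expected_conditions but never calls them; a minimal sketch, not from the original notes, of an explicit wait, reusing Baidu's kw input from the earlier examples:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

# explicit wait: block for up to 5 seconds until the element is present, then return it
wait = WebDriverWait(browser, 5)
search_input = wait.until(EC.presence_of_element_located((By.ID, 'kw')))
search_input.send_keys('selenium')

browser.close()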
6. A WeChat auto-reply bot
# Pie chart of friends' gender
from wxpy import *
from pyecharts import Pie

bot = Bot(cache_path=True)  # confirm the login on your phone

# get all friend objects as a list
friends = bot.friends()

attr = ['male friends', 'female friends', 'unknown gender']
value = [0, 0, 0]

for friend in friends:
    if friend.sex == 1:    # 1 means male
        value[0] += 1
    elif friend.sex == 2:  # 2 means female
        value[1] += 1
    else:
        value[2] += 1

pie = Pie('Gender ratio of my friends')
# chart name (str), attribute names (list), values (list); is_label_show toggles the labels
pie.add('', attr, value, is_label_show=True)
pie.render('sex.html')

# open the result in a browser
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('/Users/lich/PycharmProjects/w3spider_Proj/sex.html')
time.sleep(10)
browser.close()
# Map of friends' home provinces
from wxpy import *
from pyecharts import Map

bot = Bot(cache_path=True)  # confirm the login on your phone

# get all friend objects as a list
friends = bot.friends()

area_dic = {}  # dict keyed by province
for friend in friends:
    if friend.province not in area_dic:
        area_dic[friend.province] = 1
    else:
        area_dic[friend.province] += 1

attr = area_dic.keys()
value = area_dic.values()

map = Map('Where my friends live', width=1200, height=600)
map.add(
    'Friends by province',
    attr,
    value,
    maptype='china',
    is_visualmap=True,  # is_visualmap -> bool: attach a VisualMap component
)
map.render('area.html')
# Auto-reply to every friend
from wxpy import *

bot = Bot(cache_path=True)

@bot.register()
def recv_send_msg(recv_msg):
    print('received:', recv_msg.text)  # recv_msg.text is the message text
    return 'auto reply: %s' % recv_msg.text

# drop into a Python shell so the program keeps running
embed()
# Auto-reply only to my wife
from wxpy import *

bot = Bot(cache_path=True)

girl_friend = bot.search('老婆')[0]
print(girl_friend)

@bot.register()  # receive messages; recv_msg.sender tells us whether the sender is girl_friend
def recv_send_msg(recv_msg):
    print('received:', recv_msg.text)  # recv_msg.text is the message text
    if recv_msg.sender == girl_friend:
        # keep a copy in the file-transfer helper so I can read it later when I have time
        recv_msg.forward(bot.file_helper, prefix='message from my wife: ')
        ms = '老婆最美丽,我对老婆的爱如滔滔江水,连绵不绝'
        print('>>> replied to my wife:', ms)
        return ms  # send the reply back to her

embed()
# Auto-reply through the Tuling (Turing Robot) API
import json
import requests
from wxpy import *

bot = Bot(cache_path=True)

# call the Tuling robot API: send the message text and get the robot's reply
def auto_reply(text):
    url = 'http://www.tuling123.com/openapi/api'
    api_key = '9df516a74fc443769b233b01e8536a42'
    payload = {
        'key': api_key,
        'info': text,
    }
    r = requests.post(url, data=json.dumps(payload))
    result = json.loads(r.content)
    # return '[from the robot] ' + result['text']
    return result['text']

girl_friend = bot.search('老婆')[0]

@bot.register()
def forward_message(msg):
    if msg.sender == girl_friend:
        return auto_reply(msg.text)

embed()
Source: https://www.cnblogs.com/lich1x/p/10803918.html