What is a web crawler?
- Every site publishes a crawling policy (e.g. https://www.baidu.com/robots.txt), which spells out which paths crawlers may and may not visit. (A small robots.txt check is sketched right after this list.)
- Technically, anything you can see in a browser can be crawled.
- Legally, much of this is a gray area: crawling often skirts the edge of what is allowed.
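A minimal sketch, not from the original notes, of checking a site's robots.txt with the standard library before crawling:

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://www.baidu.com/robots.txt')
rp.read()  # download and parse robots.txt

# can_fetch() answers whether the given user agent may crawl the given URL
print(rp.can_fetch('*', 'https://www.baidu.com/s'))
print(rp.can_fetch('Googlebot', 'https://www.baidu.com/baidu'))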
1. The requests module (for simulating HTTP requests)
- Installation: pip3 install requests. (urllib and urllib2 come built in with Python; the requests module is a wrapper built on top of them.)

# **** Basic usage ****
# import the module
import requests

# send a GET request; it returns a response object
res = requests.get('https://www.baidu.com')

# the body of the response
print(res.text)

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

# the status code of the response
print(res.status_code)
# **** Carrying query parameters: Chinese must be URL-encoded ****
import requests
from urllib.parse import urlencode

key = input('Enter the search keyword: ')
# Chinese or other special characters carried as parameters must be encoded
key_search = urlencode({'wd': key})
print(key_search)

url = 'https://www.baidu.com/s?%s' % key_search

# anti-crawling countermeasure #1: send a User-Agent header
res = requests.get(url,
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   })

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
# Encoding by hand every time is tedious; use the params argument of requests instead
import requests

key = input('Enter the search keyword: ')

# anti-crawling countermeasure #1: send a User-Agent header
res = requests.get('https://www.baidu.com/s',
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   },
                   # parameters carried in the GET query string
                   params={'wd': key, 'pn': 70})

print(res.text)

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)


# Cookies
import requests

cookies = {
    'user_session': '2OjTrCoCXigz5gB7trCsYfMBl0jQ-abqjXdCcas9UqwVmD7y',
}

# GitHub does not restrict request headers much, so no custom User-Agent is needed; other sites may require one
response = requests.get('https://github.com/settings/emails',
                        cookies=cookies)

print('lich_qiu@163.com' in response.text)  # True
- GET parameters
    params = dict (parameters passed in the query string)
    headers = dict
        - User-Agent: the client type
        - Referer: the previous address the request came from (used for image hotlink protection)
        - Host:
        - Cookie: a string
    cookies = {'user_session': 'xxx'}
        Cookies are special and used constantly; they normally belong in the request headers, but requests handles them through a separate argument.
- POST parameters
    params
    headers
    cookies
    data: the request body, encoded as urlencoded form data by default
    json: pass a dict and the request is sent with 'content-type': 'application/json'
    allow_redirects=False: whether redirects are followed; defaults to True and is rarely changed.
    (A short data-vs-json sketch follows the GitHub login example below.)

# Step 1: send a GET request to https://github.com/login
import requests
import re

res_login = requests.get('https://github.com/login',
                         headers={
                             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                         })
print(res_login.text)

# the returned page contains an authenticity_token; pull it out
# re.S treats the whole string as one line
authenticity_token = re.findall(r'name="authenticity_token".*?value="(.*?)"', res_login.text, re.S)[0]
print(authenticity_token)

# grab the not-yet-authenticated cookie
login_cookie = res_login.cookies.get_dict()
print(login_cookie)

# Step 2: POST the username and password to https://github.com/session
data = {
    'commit': 'Sign in',
    'utf8': '?',
    'authenticity_token': authenticity_token,
    'login': 'lich_qiu@163.com',
    'password': 'zhang319!',
    'webauthn-support': 'supported'
}

res = requests.post(url='https://github.com/session',
                    # the request body
                    data=data,
                    # carry the cookie that has not been authenticated yet
                    cookies=login_cookie)

# on a successful login the server returns a cookie; keep it and send it with later requests
# res.cookies.get_dict() converts the returned cookie into a dict
res_cookie = res.cookies.get_dict()
print(res_cookie)

# Step 3: request https://github.com/settings/emails and check whether lich_qiu@163.com appears in the response
# GitHub does not restrict request headers much, so no custom User-Agent is strictly needed; other sites may require one
response = requests.get('https://github.com/settings/emails',
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                            'Referer': 'https://github.com/settings/profile'
                        },
                        cookies=res_cookie)

print('lich_qiu@163.com' in response.text)  # True
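A minimal sketch, not from the original notes, of the difference between the data and json arguments, using httpbin.org as a test endpoint:

import requests

# data= sends an application/x-www-form-urlencoded body
r1 = requests.post('http://httpbin.org/post', data={'name': 'lich'})
print(r1.json()['form'])  # {'name': 'lich'}

# json= serializes the dict and sets the content-type to application/json
r2 = requests.post('http://httpbin.org/post', json={'name': 'lich'})
print(r2.json()['json'])  # {'name': 'lich'}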
# Encoding issues
import requests

response = requests.get('http://www.autohome.com/news')

# the encoding the current page appears to use
print(response.apparent_encoding)

# switch the decoding to gbk
response.encoding = 'gbk'
print(response.text)
# Downloading an image
# for a small file you can fetch the whole content in one go
import requests

res = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1556732811646&di=2bd8396b35047f33fbcd6b023f5787b7&imgtype=0&src=http%3A%2F%2Fs15.sinaimg.cn%2Fmw690%2F0066UWNtgy6Viz3mEBoce%26690')

with open('a.jpg', 'wb') as f:
    f.write(res.content)
# Downloading a video
# for a large file, loop over the body with iter_content()
import requests

# stream=True keeps the body from being loaded into memory all at once
res = requests.get('http://static.yximgs.com/s1/videos/www_main-059ce9beee.mp4', stream=True)
with open('a.mp4', 'wb') as f:
    for chunk in res.iter_content():
        f.write(chunk)
# Parsing JSON
import requests
import json

response = requests.get('http://httpbin.org/get')

res1 = json.loads(response.text)  # the long way round
res2 = response.json()            # get the JSON data directly

print(res1 == res2)  # True, the results are identical
- The response object (a redirect sketch follows below)
    print(response.text)                --- body as text
    print(response.content)             --- body as bytes
    print(response.status_code)         --- status code
    print(response.headers)             --- response headers
    print(response.cookies)             --- cookies returned by the server
    print(response.cookies.get_dict())  --- returned cookies as a dict
    print(response.cookies.items())     --- like dict.items()
    print(response.url)                 --- the final URL (after any redirects)
    print(response.history)             --- list of the intermediate redirect responses
    print(response.encoding)            --- encoding used to decode the body
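A small sketch, not from the original notes, showing what url and history hold after a redirect, again using httpbin.org:

import requests

# /redirect/1 answers with a redirect that points at /get
response = requests.get('http://httpbin.org/redirect/1')

print(response.url)          # the final URL after the redirect was followed
print(response.history)      # the intermediate redirect response(s)
print(response.status_code)  # 200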
2. Advanced usage of the requests module
# 1 SSL cert verification
# verify=False skips certificate verification
import requests

# carrying a client certificate
response = requests.get('https://www.12306.cn',
                        cert=('/path/server.crt', '/path/key'))
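A minimal sketch, not from the original notes, of the verify=False case mentioned above; it skips certificate checks, with the warning that urllib3 emits for unverified HTTPS requests silenced:

import requests
import urllib3

# silence the InsecureRequestWarning raised for unverified HTTPS requests
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# verify=False: do not validate the server's certificate
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)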
# 2 Using a proxy
# HTTP proxy
import requests

proxies = {
    # a proxy with a username and password; user:password goes before the @
    'http': 'http://lich:123@112.85.151.216:9999',
    # or a plain proxy without auth: 'http': 'http://223.241.116.173:8010'
    # (a dict can only hold one entry per scheme, so pick one)
    'https': 'https://localhost:8888',
}

response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)
# SOCKS proxy (needs the SOCKS extra: pip3 install requests[socks])
import requests

proxies = {
    # a proxy with a username and password; user:password goes before the @
    'http': 'socks5://lich:123@112.85.151.216:9999',
    # 'http': 'socks5://223.241.116.173:8010',
    # 'https': 'socks5://localhost:8888',
}

response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)
# Timeout setting
import requests

response = requests.get('https://www.12306.cn', timeout=0.0001)
# Uploading a file
import requests

files = {
    'file': open('a.jpg', 'rb')
}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)
3. Crawler project examples
# Single-threaded crawl
import requests
import re
import os


# generic helper: fetch the page content for a URL
def get_page(url):
    ret = requests.get(url)
    if ret.status_code == 200:
        return ret.text


def parse_res(text):
    # <a href="video_1549859" class="vervideo-lilink actplay">
    urls = re.findall(r'class="categoryem".*?href="(.*?)" ', text, re.S)
    print(urls)
    for url in urls:
        print(url)
        yield 'https://www.pearvideo.com/' + url


def parse_detail(text):
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    # print('actual address of the video file:', movie_url)
    return movie_url


def base_dir():
    base = os.path.dirname(os.path.abspath(__file__))
    return base


def download_movie(url):
    import time
    movie_content = requests.get(url)
    file_name = str(time.time()) + '.mp4'
    with open('%s/download/%s' % (base_dir(), file_name), 'wb') as f:
        f.write(movie_content.content)


if __name__ == '__main__':
    res = get_page('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=6&start=1')
    # res is the content of the listing page
    urls = parse_res(res)
    for url in urls:
        try:
            res_detail = get_page(url)
            movie_url = parse_detail(res_detail)
            download_movie(movie_url)
        except Exception as e:
            print(e)
# Multi-threaded crawl
import requests
import re
import os
from concurrent.futures import ThreadPoolExecutor

# create a pool of 60 threads up front
pool = ThreadPoolExecutor(60)


# generic helper: fetch the page content for a URL
def get_page(url):
    ret = requests.get(url)
    if ret.status_code == 200:
        return ret.text


def parse_res(text):
    # text is a Future here; .result() gets the return value of the previous task
    text = text.result()
    urls = re.findall(r'class="categoryem".*?href="(.*?)" ', text, re.S)
    print(urls)
    for url in urls:
        print(url)
        # yield 'https://www.pearvideo.com/' + url
        pool.submit(get_page, 'https://www.pearvideo.com/' + url).add_done_callback(parse_detail)


def parse_detail(text):
    text = text.result()
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    # print('actual address of the video file:', movie_url)
    pool.submit(download_movie, movie_url)


def base_dir():
    base = os.path.dirname(os.path.abspath(__file__))
    base = os.path.join(base, 'download')
    return base


def download_movie(url):
    import time
    movie_content = requests.get(url)
    file_name = str(time.time()) + '.mp4'
    file = os.path.join(base_dir(), file_name)
    if movie_content.status_code == 200:
        with open(file, 'wb') as f:
            f.write(movie_content.content)


if __name__ == '__main__':
    for i in range(3):
        url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=6&start=%s' % (i * 12 + 1)
        # submit the page-fetching job to the pool
        # add_done_callback() runs the given function once the job has finished
        pool.submit(get_page, url).add_done_callback(parse_res)
# Using the BeautifulSoup module
import requests
from bs4 import BeautifulSoup
import time
import os

# https://www.autohome.com.cn/news/2/#liststart
for i in range(1, 10):
    url = 'https://www.autohome.com.cn/news/%s/#liststart' % i
    ret = requests.get(url)
    # print(ret.text)

    # soup = BeautifulSoup(ret.text, 'lxml')
    soup = BeautifulSoup(ret.text, 'html.parser')
    ul = soup.find(name='ul', attrs={'class': 'article'})
    li_list = ul.find_all(name='li')
    for li in li_list:
        try:
            news_url = 'https:' + li.find(name='a').get('href')   # read an attribute: URL of the news item
            news_title = li.find(name='h3').text                   # text of the h3 tag: title
            news_desc = li.find(name='p').text                     # summary of the news item
            news_img = 'https:' + li.find(name='img').get('src')   # image of the news item
            print(
                '''
                Title:    %s
                Summary:  %s
                URL:      %s
                Image:    %s
                ''' % (news_title, news_desc, news_url, news_img)
            )

            # download the news image
            response = requests.get(news_img)
            time_name = str(time.time()) + '.jpg'
            base_path = os.path.dirname(os.path.abspath(__file__))
            download_path = os.path.join(base_path, 'download')
            file_name = os.path.join(download_path, time_name)
            with open(file_name, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            print(e)
# Logging in to Chouti directly: the login succeeds, but the upvote at the end does not
import requests

ret = requests.get('https://dig.chouti.com',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   })
# print(ret.status_code)
print(ret.text)

# simulated login: status code 9999 means the login succeeded, but upvoting still fails
ret = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                    },
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    })
print(ret.text)

# take the cookie returned after the login
cookie = ret.cookies.get_dict()

# upvote an article: POST to https://dig.chouti.com/link/vote?linksId=25944651
res = requests.post('https://dig.chouti.com/link/vote?linksId=25944651',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=cookie)
print(res.text)
# Step 1: open the Chouti home page first (the vote requests later carry this first-visit cookie, ret_cookie)
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://dig.chouti.com/',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                   })
ret_cookie = ret.cookies.get_dict()

# Step 2: simulate the login
res = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=ret_cookie,
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    })
print(res.text)
res_cookie = res.cookies.get_dict()

# # Step 3 (hard-coded article id; superseded by the loop below)
# # upvote one article: POST to https://dig.chouti.com/link/vote?linksId=25944651
# response = requests.post('https://dig.chouti.com/link/vote?linksId=25944651',
#                          headers={
#                              'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
#                              'referer': 'https://dig.chouti.com/'
#                          },
#                          cookies=ret_cookie)
# print(response.text)

# Step 3: after logging in, walk the listing pages and collect the vote URLs
post_url_list = []
for i in range(5, 10):
    response = requests.get('https://dig.chouti.com/all/hot/recent/%s' % i,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            })
    soup = BeautifulSoup(response.text, 'html.parser')
    div_all = soup.find(name='div', attrs={'class': 'content-list', 'id': 'content-list'})
    div_list = div_all.find_all(name='div', attrs={'class': 'news-pic'})
    for div in div_list:
        try:
            news_id = div.find(name='img').get('lang')
            # 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url = 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url_list.append(post_url)
        except Exception as e:
            print('something went wrong here:', e)
# print(post_url_list)

# Step 4: loop over the collected articles and upvote each one
# upvoting = POST to https://dig.chouti.com/link/vote?linksId=<news id>
for url in post_url_list:
    up_news = requests.post(url,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            },
                            cookies=ret_cookie)
    print(up_news.text)
# Step 1: open the Chouti home page first
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://dig.chouti.com/',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                   })
ret_cookie = ret.cookies.get_dict()

# Step 2: simulate the login
res = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=ret_cookie,
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    })
print(res.text)
res_cookie = res.cookies.get_dict()

# Step 3: after logging in, walk the listing pages, collect the news ids and build the vote URLs
post_url_list = []
news_id_list = []
for i in range(5, 10):
    response = requests.get('https://dig.chouti.com/all/hot/recent/%s' % i,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            })
    soup = BeautifulSoup(response.text, 'html.parser')
    div_all = soup.find(name='div', attrs={'class': 'content-list', 'id': 'content-list'})
    div_list = div_all.find_all(name='div', attrs={'class': 'news-pic'})
    for div in div_list:
        try:
            news_id = div.find(name='img').get('lang')
            news_id_list.append(news_id)
            # 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url = 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url_list.append(post_url)
        except Exception as e:
            print('something went wrong here:', e)

# Step 4: loop over the articles and cancel the upvote on each one
# cancelling = POST to https://dig.chouti.com/vote/cancel/vote.do with form data {linksId: <news id>}
url = 'https://dig.chouti.com/vote/cancel/vote.do'
for news_id in news_id_list:
    up_news = requests.post(url,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            },
                            cookies=ret_cookie,
                            data={
                                'linksId': news_id
                            })
    print(up_news.text)
4. The bs4 (BeautifulSoup) module
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# 1. Basic usage
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
# soup = BeautifulSoup(open('a.html'), 'lxml')

print(soup.p)                 # if several identical tags exist, only the first one is returned
print(soup.p.b.text)          # The Dormouse's story
print(soup.p.b.get('class'))  # ['boldest']
print(soup.a)                 # if several identical tags exist, only the first one is returned

# 2. Tag name
print(soup.p.name)

# 3. Tag attributes
print(soup.p.attrs)

# 4. Tag contents
print(soup.p.string)    # returns the text only when p has a single text child, otherwise None
print(soup.p.strings)   # a generator over all the text under p
print(soup.p.text)      # all the text under p
for line in soup.stripped_strings:  # all text with the whitespace stripped
    print(line)

'''
If a tag has more than one child, .string cannot decide which child's text to return, so it
returns None; with a single child it returns that child's text. For a structure like the one
below, soup.p.string is None, but soup.p.strings still finds all the text.
<p id='list-1'>
    哈哈哈哈
    <a class='sss'>
        <span>
            <h1>aaaa</h1>
        </span>
    </a>
    <b>bbbbb</b>
</p>
'''

# 5. Nested selection
print(soup.head.title.string)
print(soup.body.a.string)

# 6. Children and descendants
print(soup.p.contents)   # all direct children of p as a list
print(soup.p.children)   # an iterator over the direct children of p
for i, child in enumerate(soup.p.children):
    print(i, child)

print(soup.p.descendants)  # descendants: every tag nested anywhere under p
for i, child in enumerate(soup.p.descendants):
    print(i, child)

# 7. Parents and ancestors
print(soup.a.parent)    # the parent of the a tag
print(soup.a.parents)   # all ancestors of the a tag: the parent, the parent's parent, ...

# 8. Siblings
print('=====>')
print(soup.a.next_sibling)          # the next sibling
print(soup.a.previous_sibling)      # the previous sibling
print(list(soup.a.next_siblings))   # all following siblings => generator
print(soup.a.previous_siblings)     # all preceding siblings => generator
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Five kinds of filters: a string, a regular expression, a list, True, and a function
# soup.find()  # find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
# name: tag name, attrs: attributes, text: text content, recursive=False disables recursive search (default True), **kwargs

from bs4 import BeautifulSoup

# 1. Exact string match
# soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find(name='body')                    # condition 1: tag name
# ret = soup.find(attrs={'class': 'title'})       # condition 2: attribute
# ret = soup.find(text="The Dormouse's story")    # condition 3: text
# print(ret)
# print(type(ret))

# 2. Regular-expression match
# import re
# soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find(name=re.compile('^p'))
# ret = soup.find(attrs={'class': re.compile('^s')})
# ret = soup.find(name='a', text=re.compile('^L'))
# print(ret)

# 3. List match
soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find_all(name=['a', 'b'])
# ret = soup.find_all(attrs={'class': ['title', 'sister']})
ret = soup.find_all(text=['Elsie', 'Lacie'])
print(ret)
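The remaining two filters from the list above (True and a function) are not shown in the original snippet; a minimal sketch of both, reusing the same soup:

# 4. True matches any value: find every tag that has an id attribute
print(soup.find_all(id=True))

# 5. A function filter: it receives each tag and returns True for the ones to keep
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

print(soup.find_all(has_class_but_no_id))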
5. The selenium module
# The most basic usage
from selenium import webdriver
import time

# webdriver.Chrome() returns an object that behaves like my browser
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
print(browser.page_source)

time.sleep(2)

# close the browser (always do this)
browser.close()
#### All selector methods
# 1. find_element_by_id                  find by id
# 2. find_element_by_link_text           find by exact link text
# 3. find_element_by_partial_link_text   find by partial link text
# 4. find_element_by_tag_name            find by tag name
# 5. find_element_by_class_name          find by class name
# 6. find_element_by_name                find by the name attribute
# 7. find_element_by_css_selector        find by CSS selector
# 8. find_element_by_xpath               find by XPath
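A minimal sketch, not from the original notes, of the CSS-selector and XPath variants side by side; the id kw is Baidu's search box, the same element used in the examples below:

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

# the same search box located two more ways
by_css = browser.find_element_by_css_selector('#kw')
by_xpath = browser.find_element_by_xpath('//input[@id="kw"]')
print(by_css.tag_name, by_xpath.tag_name)  # input input

browser.close()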
# Simple use 1: open Baidu and type a keyword into the search box
from selenium import webdriver
import time

try:
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')
    time.sleep(2)

    search_input = browser.find_element_by_id('kw')
    key = input('Enter the search keyword: ')
    search_input.send_keys(key)
    time.sleep(5)
except Exception as e:
    print(e)
finally:
    browser.close()
# Simple use 2: open Baidu and log in
from selenium import webdriver
import time

try:
    browser = webdriver.Chrome()
    # implicit wait: when looking up an element, wait up to 3 seconds for it to appear
    browser.implicitly_wait(3)
    browser.get('https://www.baidu.com')
    time.sleep(2)

    login_btn = browser.find_element_by_link_text('登录')
    login_btn.click()
    user_login = browser.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    username_input = browser.find_element_by_id('TANGRAM__PSP_10__userName')
    username_input.send_keys('13681878977')
    password_input = browser.find_element_by_id('TANGRAM__PSP_10__password')
    password_input.send_keys('zhang319!')
    submit_btn = browser.find_element_by_id('TANGRAM__PSP_10__submit')
    submit_btn.click()
    time.sleep(5)

    search_input = browser.find_element_by_id('kw')
    search_input.send_keys('名侦探柯南')
    time.sleep(10)
except Exception as e:
    print(e)
finally:
    browser.close()
# Simple use 3: crawl product listings from jd.com
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time


def get_goods(browser):
    li_list = browser.find_elements_by_class_name('gl-item')
    for li in li_list:
        goods_price = li.find_element_by_css_selector('.p-price i').text
        goods_comment = li.find_element_by_css_selector('.p-commit strong a').text
        goods_name = li.find_element_by_css_selector('.p-name-type-2 a').get_attribute('title')
        goods_url = li.find_element_by_css_selector('.p-name-type-2 a').get_attribute('href')
        goods_img = li.find_element_by_css_selector('.p-img a img').get_attribute('src')
        if not goods_img:
            # lazily loaded images keep their address in data-lazy-img instead of src
            goods_img = 'https:' + li.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')

        print(
            '''
            Name:      %s
            Price:     %s
            Comments:  %s
            URL:       %s
            Image:     %s
            ''' % (goods_name, goods_price, goods_comment, goods_url, goods_img)
        )

    # go to the next page and crawl it recursively
    next_page = browser.find_element_by_partial_link_text('下一页')
    time.sleep(2)
    next_page.click()
    get_goods(browser)


def spider():
    try:
        browser = webdriver.Chrome()
        browser.implicitly_wait(3)
        browser.get('https://www.jd.com')

        search_input = browser.find_element_by_id('key')
        search_input.send_keys('手机')
        search_input.send_keys(Keys.ENTER)
        time.sleep(5)

        # pull the product information out of the page
        get_goods(browser)
    except Exception as e:
        print(e)
    finally:
        browser.close()


if __name__ == '__main__':
    spider()
# Simple use 4: simulate the browser's back and forward buttons
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')

browser.back()
time.sleep(10)
browser.forward()
browser.close()


# Executing JavaScript
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('alert(1234)')
# while the alert raised by the js code is open, .close() does not take effect
browser.close()


# Tab management: tabs can be switched with js (window.open), with keyboard shortcuts such as
# ctrl+t, etc.; the js way is the most portable
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')

print(browser.window_handles)  # all open tabs
browser.switch_to_window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)
browser.switch_to_window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()


# Controlling mouse drag
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By                 # lookup strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys             # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait   # wait for elements to appear on the page
import time

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
wait = WebDriverWait(driver, 3)
# driver.implicitly_wait(3)  # or use an implicit wait

try:
    driver.switch_to.frame('iframeResult')  # switch into the iframeResult frame
    sourse = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')

    # approach 1: queue the actions on one chain and run them serially
    # actions = ActionChains(driver)          # get an action-chain object
    # actions.drag_and_drop(sourse, target)   # put the action on the chain, ready to run serially
    # actions.perform()

    # approach 2: separate chains, moving by a small offset each time
    ActionChains(driver).click_and_hold(sourse).perform()
    distance = target.location['x'] - sourse.location['x']

    track = 0
    while track < distance:
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        track += 2

    ActionChains(driver).release().perform()

    time.sleep(10)
finally:
    driver.close()
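The drag-and-drop snippet above imports WebDriverWait and expected_conditions but never calls them; a minimal sketch, not from the original notes, of an explicit wait, reusing Baidu's kw input from the earlier examples:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

# explicit wait: block for up to 5 seconds until the element is present, then return it
wait = WebDriverWait(browser, 5)
search_input = wait.until(EC.presence_of_element_located((By.ID, 'kw')))
search_input.send_keys('selenium')

browser.close()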
6. A WeChat auto-reply bot
# Pie chart of friends' gender
from wxpy import *
from pyecharts import Pie

bot = Bot(cache_path=True)  # confirm the login on your phone

# get all friend objects as a list
friends = bot.friends()

attr = ['male friends', 'female friends', 'unknown gender']
value = [0, 0, 0]

for friend in friends:
    if friend.sex == 1:    # 1 means male
        value[0] += 1
    elif friend.sex == 2:  # 2 means female
        value[1] += 1
    else:
        value[2] += 1

pie = Pie('Gender ratio of my friends')
# chart name (str), attribute names (list), values (list); is_label_show toggles the labels
pie.add('', attr, value, is_label_show=True)
pie.render('sex.html')

# open the result in a browser
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('/Users/lich/PycharmProjects/w3spider_Proj/sex.html')
time.sleep(10)
browser.close()
# Map of friends' home provinces
from wxpy import *
from pyecharts import Map

bot = Bot(cache_path=True)  # confirm the login on your phone

# get all friend objects as a list
friends = bot.friends()

area_dic = {}  # dict keyed by province
for friend in friends:
    if friend.province not in area_dic:
        area_dic[friend.province] = 1
    else:
        area_dic[friend.province] += 1

attr = area_dic.keys()
value = area_dic.values()

map = Map('Where my friends live', width=1200, height=600)
map.add(
    'Friends by province',
    attr,
    value,
    maptype='china',
    is_visualmap=True,  # is_visualmap -> bool: attach a VisualMap component
)
map.render('area.html')
# Auto-reply to every friend
from wxpy import *

bot = Bot(cache_path=True)

@bot.register()
def recv_send_msg(recv_msg):
    print('received:', recv_msg.text)  # recv_msg.text is the message text
    return 'auto reply: %s' % recv_msg.text

# drop into a Python shell so the program keeps running
embed()
# Auto-reply only to my wife
from wxpy import *

bot = Bot(cache_path=True)

girl_friend = bot.search('老婆')[0]
print(girl_friend)

@bot.register()  # receive messages; recv_msg.sender tells us whether the sender is girl_friend
def recv_send_msg(recv_msg):
    print('received:', recv_msg.text)  # recv_msg.text is the message text
    if recv_msg.sender == girl_friend:
        # keep a copy in the file-transfer helper so I can read it later when I have time
        recv_msg.forward(bot.file_helper, prefix='message from my wife: ')
        ms = '老婆最美丽,我对老婆的爱如滔滔江水,连绵不绝'
        print('>>> replied to my wife:', ms)
        return ms  # send the reply back to her

embed()
# Auto-reply through the Tuling (Turing Robot) API
import json
import requests
from wxpy import *

bot = Bot(cache_path=True)

# call the Tuling robot API: send the message text and get the robot's reply
def auto_reply(text):
    url = 'http://www.tuling123.com/openapi/api'
    api_key = '9df516a74fc443769b233b01e8536a42'
    payload = {
        'key': api_key,
        'info': text,
    }
    r = requests.post(url, data=json.dumps(payload))
    result = json.loads(r.content)
    # return '[from the robot] ' + result['text']
    return result['text']

girl_friend = bot.search('老婆')[0]

@bot.register()
def forward_message(msg):
    if msg.sender == girl_friend:
        return auto_reply(msg.text)

embed()
Source: https://www.cnblogs.com/lich1x/p/10803918.html