
Python Web Crawlers


One article to get you up to speed with Python web crawlers

I. What is a web crawler:

    1. Plain-language definition: a crawler is a program that simulates a human requesting a website. It can automatically request web pages, grab the returned data, and then extract the valuable parts according to a set of rules.

    2. Formal definition: see the Baidu Baike entry on web crawlers.
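
To make this concrete, here is a minimal sketch of the request-then-extract loop described above, using only the standard library (the target URL and the regular expression are illustrative assumptions, not something the article prescribes):

# A minimal "request, then extract by rule" sketch (illustrative only)
import re
import urllib.request

# 1. Request the page automatically
response = urllib.request.urlopen("http://www.baidu.com/")
html = response.read().decode("utf-8")

# 2. Extract the valuable part with a rule (here: the page title, via a regex)
match = re.search(r"<title>(.*?)</title>", html, re.S)
if match:
    print(match.group(1))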

II. Python urllib:

# demo01.py (basic urllib usage)

# Import urllib (part of the standard library, nothing to install)
import urllib.request
# Request Baidu and receive the response
response = urllib.request.urlopen("http://www.baidu.com/")
# Print the page
print(response.read().decode("utf-8"))

# demo02.py (urlopen usage notes)

# urllib usage notes
# urlopen: urllib.request.urlopen(url, data, timeout)

import socket
import urllib.request
import urllib.parse
import urllib.error

"""
A:
response = urllib.request.urlopen(‘http://www.baidu.com/‘)
print(response.read().decode(‘utf-8‘))

B:
data = urllib.parse.urlencode({‘word‘: ‘hello‘}).encode(‘utf-8‘)
response = urllib.request.urlopen("http://httpbin.org/post", data = data)
print(response.read())

C:
response = urllib.request.urlopen("http://httpbin.org/get",timeout=1)
print(response.read())
"""

try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")

# demo03.py (the response object)

# urllib response object

import urllib.request
response = urllib.request.urlopen("http://www.baidu.com/")
# Print the response type
print(type(response))
# Print the status code
print(response.status)
# Print the response headers
print(response.getheaders())

# demo04.py (the Request class in detail)

# The Request class in detail

import urllib.request
from urllib import parse

"""
A:
request = urllib.request.Request(‘http://www.baidu.com‘)
response = urllib.request.urlopen(request)
print(response.read().decode(‘utf-8‘))

B:
url = "http://httpbin.org/post"
# 指定请求头
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    "Host": "api.github.com"
}
# 请求数据
dict = {
    "name":"Germey"
}
data = bytes(parse.urlencode(dict),encoding=‘utf-8‘)
request = urllib.request.Request(url=url,data=data,headers=headers,method=‘POST‘)
response = urllib.request.urlopen(request)
print(response.read().decode(‘utf-8‘))
"""

url = "http://httpbin.org/post"
# Request data
payload = {
    "name": "Germey"
}
data = bytes(parse.urlencode(payload), encoding="utf-8")
request = urllib.request.Request(url=url, data=data, method="POST")
request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36")
response = urllib.request.urlopen(request)
print(response.read().decode("utf-8"))

# demo05.py (proxies)

# handler (proxy)
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    "http": "http://xxx.xxx.xxx.xxx:xxxx",
    "https": "https://xxx.xxx.xxx.xxx:xxxx"
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open("http://www.baidu.com")
print(response.read().decode("utf-8"))

# demo06.py (cookies)

# cookies

import http.cookiejar
import urllib.request

"""
A: basic use of http.cookiejar
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

B: MozillaCookieJar stores the site's cookies in a local file
filename = "utils/cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

C: LWPCookieJar also stores the site's cookies in a local file
filename = "utils/cookie01.txt"
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

D: use the cookies saved in the file
"""
cookie = http.cookiejar.LWPCookieJar()
cookie.load("utils/cookie01.txt", ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
print(response.read().decode("utf-8"))

# demo07.py (exception handling)

# Exception handling

import urllib.request
from urllib import error

"""
A: urllib error 简单使用
try:
    response = urllib.request.urlopen(‘http://www.baidu.com‘)
except error.URLError as e:
    print(e.reason)
    
B:
try:
    response = urllib.request.urlopen(‘http://www.baidu.com/‘)
    print(response.read().decode(‘utf-8‘))
except error.URLError as e:
    print(e.reason)
else:
    print("*************")
    
C: timeout
try:
    response = urllib.request.urlopen(‘http://www.baidu.com‘,timeout=0.01)
except error.URLError as e:
    print(e.reason)
"""

# A link that does not exist
try:
    response = urllib.request.urlopen("http://www.abcdhaha2.com/")
    html = response.read().decode("utf-8")
    print(html)
except error.URLError as e:
    print(e.reason)

# demo08.py (URL parsing)

from urllib.parse import urlparse
from urllib.parse import urlunparse
from urllib.parse import urljoin
from urllib.parse import urlencode

# Syntax: urlparse(url, scheme='http|https', allow_fragments=True)

# A
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment")
print(type(result))
print(result)

# B
result = urlparse("www.baidu.com/index.html;user?id=5#comment", scheme="https")
print(result)

# C
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment", allow_fragments=True)
print(result)

# D
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment", allow_fragments=False)
print(result)

# E
result = urlparse("https://www.baidu.com/index.html#comment", allow_fragments=False)
print(result)

# F (urlunparse)
data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
print(urlunparse(data))

# G (urljoin)
# Syntax: urljoin(base_url, suffix_to_append)
print(urljoin("https://www.cnblogs.com/xingxingnbsp/p/xxxxxxxxx.html", "12129466.html"))

# H (urlencode)
params = {
    "name": "hello_urllib",
    "age": 18
}
base_url = "http://www.baidu.com?"
url = base_url + urlencode(params)
print(url)

III. Python requests:

1. Install the requests library: pip install requests

# demo01.py

# Basic usage of requests

import requests

response = requests.get("http://www.baidu.com")
print(type(response))           # Print the response type
print(response.status_code)     # Print the status code
print(type(response.text))      # Print the type of the response body
print(response.text)            # Print the response body
print(response.cookies)         # Print the response cookies

2. Request methods (a short sketch exercising each of these against httpbin.org follows the list):

1 requests.get(url)
2 requests.post(url)
3 requests.put(url)
4 requests.patch(url)
5 requests.delete(url)
6 requests.head(url)
7 requests.options(url)
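
As a quick illustration, here is a hedged sketch that exercises each of these verbs against httpbin.org (httpbin simply echoes the request back, so it is a convenient test target; the choice of endpoints is an assumption for demonstration purposes):

# Exercising the different request methods against httpbin.org (illustrative)
import requests

print(requests.get("http://httpbin.org/get").status_code)
print(requests.post("http://httpbin.org/post").status_code)
print(requests.put("http://httpbin.org/put").status_code)
print(requests.patch("http://httpbin.org/patch").status_code)
print(requests.delete("http://httpbin.org/delete").status_code)
print(requests.head("http://httpbin.org/get").status_code)      # HEAD: headers only, no body
print(requests.options("http://httpbin.org/get").status_code)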

3. Basic GET requests:

# demo02.py

import requests

"""
A:
response = requests.get(‘http://www.baidu.com‘)
print(response.text)

B:
response = requests.get(‘http://httpbin.org/get?name=hello&age=22‘)
print(response.text)
"""

data = {
    "name":"hello",
    "age":22
}
response = requests.get("http://httpbin.org/get", params=data)
print(response.text)

4. Parsing JSON:

# demo03.py

# Parse JSON

import requests
response = requests.get("https://api.jinse.com/v6/www/information/list?catelogue_key=news&limit=23&information_id=18762945&flag=down&version=9.9.9&_source=www")
print(type(response))
print(response.json())
print(type(response.json()))

5. Fetching binary data

# demo04.py

import requests

"""
A:
response = requests.get(‘https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500‘)
print(type(response.text))
print(type(response.content))
print(response.text)
print(response.content)
"""
response = requests.get("https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500")
with open("images/image.png", "wb") as f:
    f.write(response.content)
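
For large files it is better not to hold the whole body in memory at once. A hedged sketch of a streaming download using stream=True and Response.iter_content (the chunk size and output file name here are arbitrary choices of mine):

# Streaming download: write the image to disk chunk by chunk (illustrative)
import requests

url = "https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
response = requests.get(url, stream=True)
with open("images/image_stream.jpg", "wb") as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)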

6. Adding headers:

# demo05.py

import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
response = requests.get("http://www.baidu.com",headers=headers)
print(response.text)

7. Basic POST requests:

# demo06.py

import requests

"""
A:
data = {
    "name":"hello",
    "age":22
}
response = requests.post("http://httpbin.org/post",data=data)
print(response.text)
"""

data = {
    "name":"hello",
    "age":22
}
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
response = requests.post("http://httpbin.org/post",data=data,headers=headers)
print(response.text)

8. The response object (attributes):

# demo07.py

import requests

response = requests.get("http://www.baidu.com")
print(type(response.status_code), response.status_code)      # Type of the status code, and the status code
print(type(response.headers), response.headers)              # Type of the headers, and the response headers
print(type(response.cookies), response.cookies)              # Type of the cookies, and the cookies
print(type(response.url), response.url)                      # Type of the URL, and the URL
print(type(response.history), response.history)              # Redirect history

9. Checking the status code:

# demo08.py

import requests

"""
A:
response = requests.get(‘http://www.baidu.com‘)
# 这里使用了python三元表达式
exit() if not response.status_code == requests.codes.ok else print(‘request successfully‘)

B:
response = requests.get(‘http://www.baidu.com‘)
# 这里使用了python三元表达式
exit() if not response.status_code == 200 else print(‘request successfully‘)

"""
response = requests.get("http://www.baidu.com")
if not response.status_code == 200:
    exit()
else:
    print("request successfully")

# All three approaches above do the same thing
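
requests also provides response.raise_for_status(), which raises an HTTPError for 4xx/5xx responses; the same check written that way might look like the following sketch (the try/except wrapper is my addition, not part of the original demo):

# Equivalent status check using raise_for_status() (illustrative)
import requests
from requests.exceptions import HTTPError

try:
    response = requests.get("http://www.baidu.com")
    response.raise_for_status()     # raises HTTPError if the status code is 4xx/5xx
    print("request successfully")
except HTTPError as e:
    print("bad status:", e)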

10. Advanced operations:

# demo09.py

import requests

# A: Uploading files --------------------------------------------------------
files = {
    "files": open("images/image.png", "rb")
}
response = requests.post("http://httpbin.org/post", files=files)
print(response.text)

# B: Getting cookies --------------------------------------------------------
response = requests.get("http://www.baidu.com")
print(response.cookies)
for key, value in response.cookies.items():
    print(key + "=" + value)
    
# C: Session persistence ----------------------------------------------------
# Without a session, the cookie set by the first request is not carried over
requests.get("http://httpbin.org/cookies/set/number/123456789")
response = requests.get("http://httpbin.org/cookies")
print(response.text)

# With a Session object the cookie is kept between requests
s = requests.Session()
s.get("http://httpbin.org/cookies/set/number/123456789")
response = s.get("http://httpbin.org/cookies")
print(response.text)

# D: Proxy settings ---------------------------------------------------------
# Method 1:
proxies = {
    "http": "http://ip:port",
    "https": "https://ip:port"
}
response = requests.get("http://www.baidu.com", proxies=proxies)
print(response.status_code)

# Method 2: proxies that require a username and password
proxies = {
    "http": "http://user:password@ip:port/",
    "https": "https://user:password@ip:port/"
}
response = requests.get("http://www.baidu.com", proxies=proxies)
print(response.status_code)

# Method 3: SOCKS5 proxies (these require: pip install requests[socks])
proxies = {
    "http": "socks5://ip:port",
    "https": "socks5://ip:port"
}
response = requests.get("http://www.baidu.com", proxies=proxies)
print(response.status_code)

# E: Certificate verification -----------------------------------------------
response = requests.get("https://www.12306.cn")
print(response.status_code)

response = requests.get("https://www.12306.cn", verify=False)
print(response.status_code)
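
# Note: verify=False makes requests emit an InsecureRequestWarning on every call.
# An optional companion step (my addition, not part of the original demo) is to
# silence that warning through urllib3:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)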

# Note: change 'path/server.crt' and 'path/key' to your own certificate paths
response = requests.get("https://www.12306.cn", cert=("path/server.crt", "path/key"))
print(response.status_code)

# F: Timeout settings -------------------------------------------------------
from requests.exceptions import ReadTimeout
try:
    response = requests.get("http://www.taobao.com", timeout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("Timeout")
    
# G: Authentication ---------------------------------------------------------
from requests.auth import HTTPBasicAuth
response = requests.get("http://www.taobao.com", auth=HTTPBasicAuth("user", "123"))
print(response.status_code)

response = requests.get("http://www.taobao.com", auth=("user", "123"))
print(response.status_code)

# H: Exception handling -----------------------------------------------------
from requests.exceptions import ReadTimeout, ConnectionError, HTTPError, RequestException
try:
    response = requests.get("http://www.taobao.com", timeout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("Timeout")
except HTTPError:
    print("HTTPError")
except ConnectionError:
    print("ConnectionError")
except RequestException:
    print("Error")

IV. The BeautifulSoup library in detail (an HTML parser):

1. Install: pip install beautifulsoup4 (the examples below also use the lxml parser: pip install lxml)

2. Basic BeautifulSoup usage:

# demo01.py

# Basic usage of BeautifulSoup
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
    <h2>这是一个列表</h2>
    <ul>
        <li>选项1</li>
        <li>选项2</li>
        <li>选项3</li>
        <li>选项4</li>
        <li>选项5</li>
        <li>选项6</li>
        <li>选项7</li>
        <li>选项8</li>
        <li>选项9</li>
    </ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, "lxml")
print(soup.prettify())
print(soup.title.string)

3. Tag selectors (they only return the first match):

# demo02.py

# BeautifulSoup tag selectors (only the first match is returned)
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
    <h2>这是一个列表</h2>
    <ul>
        <li>选项1</li>
        <li>选项2</li>
        <li>选项3</li>
        <li>选项4</li>
        <li>选项5</li>
        <li>选项6</li>
        <li>选项7</li>
        <li>选项8</li>
        <li>选项9</li>
    </ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, "lxml")
print(soup.title)
print(type(soup.title))
print(soup.head)
print(soup.li)

4. Getting a tag's name:

# demo03.py

# BeautifulSoup: getting a tag's name
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
</body>
</html>
"""
soup = BeautifulSoup(html, "lxml")
print(soup.title.name)

5. Getting tag attributes:

# demo04.py

# BeautifulSoup: getting tag attributes
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<p class="font-p"></p>
<a href="http://www.baidu.com">百度一下 你就知道</a>
</body>
</html>
"""
soup = BeautifulSoup(html, "lxml")
print(soup.p.attrs)
print(soup.p.attrs["class"])
print(soup.a.attrs["href"])

6. Getting tag contents:

# demo05.py

# BeautifulSoup: getting tag contents
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<p>div</p>
<a href="http://www.baidu.com">百度一下 你就知道</a>
</body>
</html>
"""
soup = BeautifulSoup(html, "lxml")
print(soup.p.string)
print(soup.a.string)

7. Nested selection:

# demo06.py

# Nested selection
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
    <h2>这是一个列表</h2>
    <ul>
        <li>选项1</li>
    </ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, "lxml")
print(soup.ul.li.string)

8. Child and descendant nodes:

# demo07.py

# Child and descendant nodes
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
    <h2>这是一个列表</h2>
    <ul><li>选项1</li><li>选项2</li><li><a href="http://www.baidu.com">百度一下 你就知道</a></li></ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, "lxml")
print(soup.ul.contents)     # All direct children, returned as a list
print(soup.ul.children)     # Direct children, returned as an iterator
print(soup.ul.descendants)  # All descendants, returned as a generator
for i,child in enumerate(soup.ul.descendants):
    print(i,child)

9. Parent and ancestor nodes:

# demo08.py

# Parent and ancestor nodes

from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
<div>
    <ol>
        <li><a href="http://www.baidu.com">百度一下 你就知道</a></li>
    </ol>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, "lxml")
print(soup.a.parent)            # The direct parent node
print(type(soup.a.parents))     # All ancestor nodes (a generator)
print(list(enumerate(soup.a.parents)))

10. Sibling nodes:

# demo09.py

# Sibling nodes
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
    <h1>我是一个大大的H1</h1>
    <h2>我是一个大大的H2</h2>
    <p>我是一个简单的p标签</p>
    <h3>我是一个大大的H3</h3>
    <h4>我是一个大大的H4</h4>
</div>
</body>
</html>
"""
html = html.replace("\n", "").replace(" ", "")   # Strip the newlines and spaces from the HTML
soup = BeautifulSoup(html, "lxml")
print(list(enumerate(soup.p.next_siblings)))        # All siblings after the current node
print(list(enumerate(soup.p.previous_siblings)))    # All siblings before the current node

11. Standard selectors (***important***)

# demo10.py

 

from bs4 import BeautifulSoup

# Standard selectors (important; worth revisiting)
# Syntax: find_all(name, attrs, recursive, text, **kwargs)
"""
find returns the first matching element; find_all returns every matching element
    1. find_parent()          # returns the direct parent node
    2. find_parents()         # returns all ancestor nodes
    3. find_next_sibling()    # returns the first sibling after the current node
    4. find_next_siblings()   # returns all siblings after the current node
    5. find_all_next()        # returns all qualifying nodes after the current node
    6. find_next()            # returns the first qualifying node after the current node
    7. find_all_previous()    # returns all qualifying nodes before the current node
    8. find_previous()        # returns the first qualifying node before the current node
(a short sketch exercising several of these follows at the end of this demo)
"""

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>这是一个列表</h2>
    <ul id="list-1">
        <li class="zhangsan">选项1</li>
        <li class="zhangsan">选项2</li>
        <li class="zhangsan">选项3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">选项1</li>
        <li class="lisi">选项2</li>
        <li class="lisi">选项3</li>
    </ul>
</div>
</body>
</html>
"""


# A: name --------------------------------------------------------------
soup = BeautifulSoup(html, "lxml")
print(soup.find_all("ul"))          # Get all ul tags, returned as a list
print(type(soup.find_all("ul")[0])) # Get the type of a single result
for ul in soup.find_all("ul"):
    print(ul.find_all("li"))

# B: attrs -------------------------------------------------------------
# Method 1:
soup = BeautifulSoup(html, "lxml")
print(soup.find_all(attrs={"id": "list-1"}))    # All elements whose id is list-1
print(soup.find_all(attrs={"class": "lisi"}))   # All elements whose class is lisi
# Method 2:
print(soup.find_all(id="list-1"))       # All elements whose id is list-1
print(soup.find_all(class_="lisi"))     # All elements whose class is lisi
# Both methods produce the same result

# C: text --------------------------------------------------------------
soup = BeautifulSoup(html, "lxml")
print(soup.find_all(text="选项1"))

# D: CSS selectors (***) -----------------------------------------------
# 1:
soup = BeautifulSoup(html, "lxml")
print(soup.select("#list-2"))       # ID selector
print(soup.select(".zhangsan"))     # class selector
print(soup.select("ul li"))         # tag selector
print(soup.select("#divid h2"))     # ID and tag selectors combined

# 2:
soup = BeautifulSoup(html, "lxml")
for ul in soup.select("ul"):
    print(ul.select("li"))

# 3: attribute access
soup = BeautifulSoup(html, "lxml")
for ul in soup.select("ul"):
    print(ul.get("id"))
    print(ul["id"])

# 4: getting the text
soup = BeautifulSoup(html, "lxml")
for li in soup.select("li"):
    print(li.get_text())
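
The find_parent / find_next_sibling family listed at the top of this demo is not exercised above; a short hedged sketch against the same html string (reusing the BeautifulSoup import and the html variable defined in this demo) could look like this:

# E: relative find_* methods (sketch, reusing the html string above) ----------
soup = BeautifulSoup(html, "lxml")
li = soup.find(class_="zhangsan")       # the first <li class="zhangsan">
print(li.find_parent())                 # its direct parent: <ul id="list-1">
print(li.find_next_sibling())           # the next sibling <li>
print(li.find_next_siblings())          # all following sibling <li> tags
print(li.find_previous("h2"))           # the nearest preceding <h2>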

 

 

 

V. The pyquery library in detail

1. Install: pip install pyquery

2. Initialization:

# demo01.py

# Initialization
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>这是一个列表</h2>
    <ul id="list-1">
        <li class="zhangsan">选项1</li>
        <li class="zhangsan">选项2</li>
        <li class="zhangsan">选项3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">选项1</li>
        <li class="lisi">选项2</li>
        <li class="lisi">选项3</li>
    </ul>
</div>
</body>
</html>
"""

# A: Initializing from a string -----------------------------------------------------------------------------------------
doc = PyQuery(html)
print(doc("li"))

# B: Initializing from a URL --------------------------------------------------------------------------------------------
doc = PyQuery(url="http://www.baidu.com")
print(doc("head"))

# C: Initializing from a file (create index.html in the same directory, with the same HTML as above) ---------------------
# This call can fail with: UnicodeDecodeError: 'gbk' codec can't decode byte 0x80 in position 187: illegal multibyte sequence
# Removing the Chinese characters from the HTML file avoids it, but that workaround is not recommended (to be investigated)
# doc = PyQuery(filename='index.html')
# print(doc('li'))

# Reading the file explicitly with a UTF-8 encoding works instead
with open("index.html", "r", encoding="utf-8") as f:
    doc = f.read()
result = PyQuery(doc)
print(result("li"))

3. Basic CSS selectors:

# demo02.py

# Basic CSS selectors
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>这是一个列表</h2>
    <ul id="list-1">
        <li class="zhangsan">选项1</li>
        <li class="zhangsan">选项2</li>
        <li class="zhangsan">选项3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">选项1</li>
        <li class="lisi">选项2</li>
        <li class="lisi">选项3</li>
    </ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
print(doc("#divid #list-1 li"))

 

4. Finding elements:

A: Child elements

# demo03.py

# Child elements
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>这是一个列表</h2>
    <ul id="list-1">
        <li class="zhangsan">选项1</li>
        <li class="zhangsan">选项2</li>
        <li class="zhangsan">选项3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">选项1</li>
        <li class="lisi">选项2</li>
        <li class="lisi">选项3</li>
    </ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
items = doc("#list-1")
print(type(items))
print(items)
li_list = items.find("li")
print(type(li_list))
print(li_list)

 

B: Parent elements

# demo04.py

# Parent elements
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>这是一个列表</h2>
    <ul id="list-1">
        <li class="zhangsan">选项1</li>
        <li class="zhangsan">选项2</li>
        <li class="zhangsan">选项3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">选项1</li>
        <li class="lisi">选项2</li>
        <li class="lisi">选项3</li>
    </ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
items = doc("#list-1")
container = items.parent()
print(type(container))
print(container)
parents = items.parents()
print(type(parents))
print(parents)

C: Sibling elements

# demo05.py

# Sibling elements
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>这是一个列表</h2>
    <ul id="list-1">
        <li class="zhangsan">选项1</li>
        <li class="zhangsan">选项2</li>
        <li class="zhangsan">选项3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">选项1</li>
        <li class="lisi">选项2</li>
        <li class="lisi">选项3</li>
    </ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
lis = doc("#list-1 .zhangsan")
print(lis.siblings())
print(lis.siblings(".zhangsan"))

D: Iteration

# demo06.py

# Iteration
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>这是一个列表</h2>
    <ul id="list-1">
        <li class="zhangsan">选项1</li>
        <li class="zhangsan">选项2</li>
        <li class="zhangsan">选项3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">选项1</li>
        <li class="lisi">选项2</li>
        <li class="lisi">选项3</li>
    </ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
lis = doc("#list-2 .lisi")
print(lis)
li_list = doc(".lisi").items()
print(type(li_list))
for li in li_list:
    print(li)

E: Getting information (tag attributes)

# demo07.py

# Getting information (attributes)
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <a href="http://www.baidu.com">百度一下 你就知道</a>
</div>
</body>
</html>
"""
doc = PyQuery(html)
a = doc("#divid a")
print(a)
print(a.attr("href"))
print(a.attr.href)

 

F: Getting text

# demo08.py

# Getting text
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <a href="http://www.baidu.com">百度一下 你就知道</a>
</div>
</body>
</html>
"""
doc = PyQuery(html)
a = doc("#divid a")
print(a)
print(a.text())

 

G: Getting HTML

# demo09.py

# Getting HTML
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <a href="http://www.baidu.com">百度一下 你就知道</a>
</div>
</body>
</html>
"""
doc = PyQuery(html)
div = doc("#divid")
print(div)
print(div.html())

 

H: DOM manipulation

# demo10.py

# DOM manipulation
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>这是一个列表</h2>
    <ul id="list-1">
        <li class="zhangsan">选项1</li>
        <li class="zhangsan">选项2</li>
        <li class="zhangsan">选项3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">选项1</li>
        <li class="lisi">选项1</li>
        <li class="lisi">选项1</li>
    </ul>
</div>
</body>
</html>
"""

# 1. add_class, remove_class -------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc(".lisi")
print(li)
li.remove_class("lisi")
print(li)
li.add_class("zhangsan")
print(li)

# 2. attr, css ----------------------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc(".zhangsan")
print(li)
li.attr("name", "link")
print(li)
li.css("font-size", "40px")
print(li)

# 3. remove -------------------------------------------------------------------------------------------------------------
doc = PyQuery(html)
div = doc("#divid")
print(div.text())
div.find("h2").remove()
print(div.text())

# 4. Pseudo-class selectors ---------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc(".zhangsan:first-child")       # The first item in the list
print(li)
li = doc(".zhangsan:last-child")        # The last item in the list
print(li)
li = doc(".zhangsan:nth-child(2)")      # The second item in the list
print(li)
li = doc(".zhangsan:gt(0)")             # All items whose index is greater than 0
print(li)
li = doc(".zhangsan:nth-child(1n)")     # Every item (the first one and everything after it)
print(li)
li = doc(".zhangsan:contains(选项3)")    # The item whose text is "选项3"
print(li)

 

VI. The selenium library in detail (a browser automation tool)

In crawling, selenium is mainly used to handle pages rendered by JavaScript.

1. Install: pip install selenium

2. Basic usage:

# demo01.py

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

"""
项目目标:实现百度搜索
1. 创建浏览器对象 请求百度
2. 元素定位输入框
3. 输入搜索内容
4. 点击回车
"""
# 创建浏览器对象(我用的是谷歌浏览器)
browser = webdriver.Chrome()
try:
    # 请求百度
    browser.get("http://www.baidu.com")
    # 定位输入框
    input = browser.find_element_by_id(kw)
    # 输入搜索内容
    input.send_keys("selenium")
    # 点击回车
    input.send_keys(Keys.ENTER)
    # 打印当前的url地址
    print(browser.current_url)
    # 打印cookies
    print(browser.get_cookies())
    # 打印页面
    print(browser.page_source)
except Exception as e:
    print(e,"=============================")
finally:
    browser.close()

"""
有可能会遇到的错误
1. selenium.common.exceptions.WebDriverException: Message: ‘chromedriver‘ executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
    这是由于程序找不到 chromedriver 驱动
解决:
    下载 chromedriver (http://chromedriver.storage.googleapis.com/index.html)
    注意版本:版本对照表 (https://blog.csdn.net/BinGISer/article/details/88559532)

2. selenium.common.exceptions.SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 78
    这是由于 ChromeDriver 和 Chrome 版本不对应
解决:
    删除之前下载的 chromedriver
    重新下载 chromedriver (http://chromedriver.storage.googleapis.com/index.html)
    注意版本:版本对照表 (https://blog.csdn.net/BinGISer/article/details/88559532)
    
大功告成
"""

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

python爬虫
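
If you would rather not add chromedriver to PATH, selenium 3.x (which the Chrome 78/79 versions mentioned above suggest this article targets) also accepts the driver location directly; a hedged sketch, where the path is an example you must adapt to your machine:

# Pointing selenium at a specific chromedriver binary (the path is an example)
from selenium import webdriver

browser = webdriver.Chrome(executable_path="C:/tools/chromedriver.exe")
browser.get("http://www.baidu.com")
print(browser.title)
browser.close()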

Original post: https://www.cnblogs.com/xingxingnbsp/p/12129466.html
