1. Common regex patterns
import re

# extract "python"
key = "javapythonc++php"
re.findall('python', key)[0]

# extract "hello world"
key = "<html><h1>hello world<h1></html>"
re.findall('<h1>(.*?)<h1>', key)[0]

# extract 170 ("I like girls who are 170cm tall")
string = '我喜欢身高为170的女孩'
re.findall(r'\d+', string)[0]

# extract http:// and https://
key = 'http://www.baidu.com and https://boob.com'
re.findall('https?://', key)

# extract hello from between the mixed-case tags
key = 'lalala<hTml>hello</HtMl>hahah'
re.findall('<[hH][tT][mM][lL]>(.*)</[hH][tT][mM][lL]>', key)

# extract "hit." from the e-mail address
key = 'bobo@hit.edu.com'
re.findall(r'h.*?\.', key)

# match sas and saas (one or two a's)
key = 'saas and sas and saaas'
re.findall('sa{1,2}s', key)

# match the lines that start with i (re.M = multi-line mode)
string = '''fall in love with you
i love you very much
i love she
i love her'''
re.findall('^i.*', string, re.M)

# match all lines at once (re.S lets . match newlines too)
string1 = """<div>细思极恐
你的队友在看书
你的闺蜜在减肥
你的敌人在磨刀
隔壁老王在炼药
</div>"""
re.findall('.*', string1, re.S)
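Several of the patterns above depend on the difference between the greedy .* and the non-greedy .*? qualifiers. A quick illustrative sketch (the sample string here is made up for the demo):

import re

key = '<h1>hello</h1><h1>world</h1>'   # made-up sample string
re.findall('<h1>(.*)</h1>', key)       # greedy: grabs as much as possible -> ['hello</h1><h1>world']
re.findall('<h1>(.*?)</h1>', key)      # non-greedy: stops at the first closing tag -> ['hello', 'world']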
#Scrape all images under the Qiushibaike "pics" section
import re
import requests
from urllib import request
import os
#1. Check whether the page data is loaded dynamically
#2. Fetch the page source
if not os.path.exists('qiutu'):
    os.mkdir('qiutu')

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
url = 'https://www.qiushibaike.com/pic/'
page_text = requests.get(url=url,headers=headers).text
#3. Parse the src attribute out of the img tags
ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
img_url_list = re.findall(ex,page_text,re.S)
for img_url in img_url_list:
    img_url = 'https:'+img_url
    imgPath = 'qiutu/'+img_url.split('/')[-1]
    #4. Request the image URL
    #5. Persist it to disk
    request.urlretrieve(url=img_url,filename=imgPath)
    print(imgPath+' downloaded successfully!!!')
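One caveat: urllib's urlretrieve sends its own default User-Agent (Python-urllib/3.x) rather than the headers dict defined above, so a site that filters on User-Agent may reject the image requests. A minimal alternative sketch for steps 4-5, reusing the img_url, imgPath and headers variables from the loop above:

    #4./5. alternative: download with requests so the custom User-Agent is also sent
    img_data = requests.get(url=img_url, headers=headers).content
    with open(imgPath, 'wb') as fp:
        fp.write(img_data)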
pip install lxml
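The scripts below use lxml's etree to build an element tree from the raw HTML text and query it with XPath expressions. A minimal sketch of the pattern on a made-up snippet:

from lxml import etree

html = '<div id="container"><a href="/a.html"><img alt="demo"/></a></div>'  # made-up sample
tree = etree.HTML(html)                          # parse the HTML string into an element tree
tree.xpath('//div[@id="container"]/a/@href')     # -> ['/a.html']
tree.xpath('//div[@id="container"]/a/img/@alt')  # -> ['demo']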
1. Scraping resume templates
import requests
import os
from lxml import etree
import random
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
url = 'http://sc.chinaz.com/jianli/free.html'
response = requests.get(url=url,headers=headers)
response.encoding = 'utf-8'
page_text = response.text
if not os.path.exists('jianli'):
    os.mkdir('jianli')
tree = etree.HTML(page_text)
div_list = tree.xpath('//div[@id="container"]/div')
for div in div_list:
    detail_url = div.xpath('./a/@href')[0]
    name = div.xpath('./a/img/@alt')[0]

    #parse the download links out of the detail page
    detail_page_text = requests.get(url=detail_url,headers=headers).text
    detail_tree = etree.HTML(detail_page_text)
    download_url_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
    #pick a random mirror link to spread the load
    download_url = random.choice(download_url_list)

    jianli_data = requests.get(url=download_url,headers=headers).content

    file_path = 'jianli/'+name+'.rar'
    with open(file_path,'wb') as fp:
        fp.write(jianli_data)
    print(file_path+' downloaded successfully')
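The explicit response.encoding = 'utf-8' matters here because requests falls back to ISO-8859-1 when the HTTP headers carry no charset, which garbles the Chinese template names. If the page's real encoding were unknown, requests' own charset detection could be used instead, e.g.:

response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding   # let requests guess the charset from the body
page_text = response.text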
###### Handling multiple pages
import requests
import os
from lxml import etree
import random
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Connection':'close'    #close each connection so the pool is not exhausted
}
start_page = 1
end_page = 5
if not os.path.exists('jianli'):
    os.mkdir('jianli')

url = 'http://sc.chinaz.com/jianli/free_%d.html'
for page in range(start_page,end_page+1):
    #the first page has no page number in its URL
    if page == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url % page
    response = requests.get(url=new_url,headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        detail_url = div.xpath('./a/@href')[0]
        name = div.xpath('./a/img/@alt')[0]
        detail_page_text = requests.get(url=detail_url,headers=headers).text
        detail_tree = etree.HTML(detail_page_text)
        download_url_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        download_url = random.choice(download_url_list)
        jianli_data = requests.get(url=download_url,headers=headers).content
        file_path = 'jianli/'+name+'.rar'
        with open(file_path,'wb') as fp:
            fp.write(jianli_data)
        print(file_path+' downloaded successfully')
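As written, a single timeout or a detail page without a download list aborts the whole run. A hedged sketch of a retry wrapper (safe_download is a hypothetical helper, not part of the original post) that the two requests.get calls inside the loop could go through:

from requests.exceptions import RequestException

def safe_download(url, headers, retries=3):
    # hypothetical helper: retry a flaky request a few times, return None on failure
    for _ in range(retries):
        try:
            return requests.get(url=url, headers=headers, timeout=10).content
        except RequestException:
            continue
    return None

The loop would then skip an entry whenever safe_download returns None instead of crashing the whole run.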
Source: https://www.cnblogs.com/TodayWind/p/13767810.html