正则表达式获取贴吧访问量
import urllib.request
import re
# Download the Baidu Tieba page for the "cpda" forum and print selected
# pieces of its HTML, extracted with regular expressions.
# The response is opened as a context manager so the connection is closed
# as soon as the body has been read.
with urllib.request.urlopen("https://tieba.baidu.com/f?kw=cpda&fr=ala0&tpl=5") as response:
    data = response.read()
# "ignore" drops bytes that are not valid UTF-8 instead of raising.
data2 = data.decode("utf-8", "ignore")

# Text of the <title> element. "." does not match newlines here, so only
# elements whose content sits on a single line are captured.
pat = r"<title>(.*?)</title>"
s1 = re.findall(pat, data2)
print(s1)

# Text of every <span class="card_numLabel"> element.
pat2 = r'<span class="card_numLabel">(.*?)</span>'
s2 = re.findall(pat2, data2)
print(s2)

# Text of every <span class="card_menNum"> element.
pat3 = r'<span class="card_menNum">(.*?)</span>'
s3 = re.findall(pat3, data2)
print(s3)

# Text of every <span class="card_infoNum"> element.
pat4 = r'<span class="card_infoNum">(.*?)</span>'
s4 = re.findall(pat4, data2)
print(s4)
正则表达式学习2--豆瓣获取文章
import urllib.request
import re
# Download the Douban Read provider listing and print the text of every
# <div class="name"> element found in the page.
# The response is opened as a context manager so the connection is closed
# as soon as the body has been read.
with urllib.request.urlopen("https://read.douban.com/provider/all") as response:
    file = response.read()
# "ignore" drops bytes that are not valid UTF-8 instead of raising.
file2 = file.decode("utf-8", "ignore")

patn = r'<div class="name">(.*?)</div>'
mydata = re.findall(patn, file2)

# First the whole list at once, then one entry at a time.
print(mydata)
for name in mydata:
    # The explicit "\n" adds a blank line after each entry.
    print(name + "\n")
URL数据获取--异常处理--新浪新闻获取文章
import urllib.request
import re
# Download the Sina News front page, collect every href that matches the
# news.sina.com.cn pattern below, and for each one print the URL together
# with a numbered local .html path.
# The response is opened as a context manager so the connection is closed
# as soon as the body has been read.
with urllib.request.urlopen("http://news.sina.com.cn/") as response:
    data = response.read()
# "ignore" drops bytes that are not valid UTF-8 instead of raising.
data2 = data.decode("utf-8", "ignore")

pat = r'href="(http://news.sina.com.cn/.*?)"'
allurl = re.findall(pat, data2)

for i, thisurl in enumerate(allurl):
    try:
        print(f"第{i}次爬取")
        print(thisurl)
        file = f"D:/sinanews/{i}.html"
        print(file)
        # NOTE(review): nothing in this try body performs a network request
        # or writes `file`, so the URLError handler below cannot trigger and
        # the success message is printed unconditionally. Presumably a
        # download call (e.g. urllib.request.urlretrieve(thisurl, file)) was
        # intended here -- confirm before adding it.
        print("-------成功-------")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
import urllib.request
import re

# Download the Douban Read provider listing and print the text of every
# <div class="name"> element found in the page.
# The response is opened as a context manager so the connection is closed
# as soon as the body has been read.
with urllib.request.urlopen("https://read.douban.com/provider/all") as response:
    file = response.read()
# "ignore" drops bytes that are not valid UTF-8 instead of raising.
file2 = file.decode("utf-8", "ignore")

patn = r'<div class="name">(.*?)</div>'
mydata = re.findall(patn, file2)

# First the whole list at once, then one entry at a time.
print(mydata)
for name in mydata:
    # The explicit "\n" adds a blank line after each entry.
    print(name + "\n")
原文:https://www.cnblogs.com/wei23/p/10887432.html