#-*- coding:utf-8 -*-
#读取北京FDA的药品经营企业数据
# 20161125 zhangshaohua
import re
import urllib.request
import urllib.parse
import os
def getContent(url, pat, charSet):
    """Download *url* and return every match of the regex *pat* in its text.

    Args:
        url: Address to fetch; handed unchanged to ``urllib.request.urlopen``.
        pat: Regular expression (pattern string or compiled pattern) to
            search for in the decoded response body.
        charSet: Name of the encoding used to decode the response bytes.

    Returns:
        The list produced by ``findall``: plain strings when *pat* has at
        most one capture group, tuples when it has several, and an empty
        list when nothing matches.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid *charSet* data.
    """
    # The context manager closes the response even when read/decode raises,
    # so a long crawl does not leak one open connection per request.
    with urllib.request.urlopen(url) as page:
        content = page.read().decode(charSet)
    return re.compile(pat).findall(content)
# Crawl the Beijing FDA drug-retailer listing and append one line per company
# (company name, registered address, licence number) to bjda.txt.
LIST_URL = 'http://www.bjda.gov.cn/eportal/ui?pageId=331148'
DETAIL_URL = 'http://www.bjda.gov.cn/eportal/ui?pageId=331631&artileId='
PAGE_SIZE = 20  # the listing shows 20 records per page
# First listing page to crawl; the value 51 is kept from the original script.
# NOTE(review): this looks like a resume point left over from an interrupted
# run -- confirm the intended start page before using this for a full crawl.
START_PAGE = 51

# One pattern per output column, in output order: company name, registered
# address, licence number.  Raw strings keep \s from being treated as an
# (invalid) string escape; \r\n still matches a literal CR LF in the page.
FIELD_PATTERNS = (
    re.compile(r'企业名称:</th>\r\n.*?<td>(.*?)</td>'),
    # \s{0,3} drops the stray line break that can precede </td> in the address.
    re.compile(r'注册地址:</th>\r\n.*?<td>(.*?)\s{0,3}</td>'),
    re.compile(r'许可证号:</th>\r\n.*?<td>(.*?)</td>'),
)

# Read the first listing page to learn the total number of records.
total_matches = getContent(LIST_URL, r'总记录数:(\d{1,5}),', 'UTF-8')
if not total_matches:
    raise SystemExit('total record count not found on ' + LIST_URL)
# Ceiling division: a partly filled last page still counts as a page.
page_count = (int(total_matches[0]) + PAGE_SIZE - 1) // PAGE_SIZE

# Open the output once; the context manager flushes and closes it even if a
# request fails half-way through the crawl.  The encoding is explicit so the
# Chinese text does not depend on the platform default.
with open('bjda.txt', 'a', encoding='utf-8') as out_file:
    # NOTE(review): the upper bound is inclusive on the assumption that
    # currentPage is 1-based -- confirm against the site.
    for page_no in range(START_PAGE, page_count + 1):
        # '&currentPage=' had been corrupted to a currency sign followed by
        # 'tPage=' because '&curren' was decoded as an HTML entity.
        page_url = LIST_URL + '&currentPage=' + str(page_no)
        for article_id in getContent(page_url, r'artileId=(.*)">查看', 'UTF-8'):
            # Download the detail page once and run all three patterns on it
            # instead of fetching the same URL once per field.
            with urllib.request.urlopen(DETAIL_URL + article_id) as response:
                detail_html = response.read().decode('UTF-8')
            matches = [pattern.findall(detail_html) for pattern in FIELD_PATTERNS]
            print(*matches)
            # A field that is absent from the page is written as an empty
            # column rather than aborting the crawl with an IndexError.
            row = ','.join(found[0] if found else '' for found in matches)
            out_file.write(row + '\n')
print('药品零售企业读取完成!')
# 经历了读取HDA的练习,此次读取BJ的数据开始比较顺畅。在读取996条数据时出错,再次出现换行造成的问题;
# 多次试错后用‘\s{0,3}’成功解决。
# 正则表达式要继续学习,才能不断进步,遇“坑”时才能顺利通过!
# 原文:http://www.cnblogs.com/lrzy/p/6109226.html