# pip install BeautifulSoup4
import codecs
import csv
import datetime
import json
import time

import requests
from bs4 import BeautifulSoup


def getUrlText(url):
    """Fetch ``url`` and return the response body as text.

    The response encoding is forced to UTF-8 before the body is decoded.
    The request is retried indefinitely, pausing 3 seconds after every
    failure, so this function only returns once a request has succeeded.

    Args:
        url: Address of the HTML resource to download.

    Returns:
        The decoded HTML text of the response.
    """
    while True:
        try:
            response = requests.get(url)
            response.encoding = 'utf-8'
            return response.text
        except requests.exceptions.ConnectionError:
            print('ConnectionError -- please wait 3 seconds')
        except requests.exceptions.ChunkedEncodingError:
            print('ChunkedEncodingError -- please wait 3 seconds')
        except Exception:
            # Deliberately broad: any other failure is retried as well.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
            # and SystemExit through so the loop can still be stopped.
            print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
        # Only reached after a failure; the success path returns above.
        time.sleep(3)


def test():
    """Download a fixed page and dump rows of its first table to ``1.csv``.

    The CSV is GBK-encoded and starts with a fixed five-column header
    (institution code, institution name, category, plan type, minimum
    score). The first ``<tr>`` of the table is skipped, and only rows whose
    first cell is purely numeric are considered data rows.
    """
    url = 'http://www.sxkszx.cn/news/201989/n378377624.html'
    html = getUrlText(url)
    soup = BeautifulSoup(html, features="lxml")
    table = soup.table

    # newline="" leaves the csv module in charge of line endings; the
    # context manager closes the file even if parsing raises midway.
    with open("1.csv", "w", encoding="gbk", newline="") as f:
        w = csv.writer(f)
        w.writerow(['院校代码', '院校名称', '科类', '计划性质', '最低分'])
        for idx, tr in enumerate(table.select('tr')):
            if idx == 0:
                continue
            tds = tr.select('td')
            # Rows without <td> cells have no first cell to inspect.
            if tds and tds[0].text.strip().isdigit():
                # NOTE(review): the body of this branch was missing from the
                # source; writing the first five cells is reconstructed from
                # the five-column header above -- confirm against the page.
                w.writerow([td.text.strip() for td in tds[:5]])


if __name__ == "__main__":
    test()
# Original article: https://www.cnblogs.com/liuyong0076/p/12524130.html