"""Scrape the ranking boards linked from top.baidu.com into MySQL and an .xls file.

The script fetches the board index page, follows every board link found in
its ``div.links`` element, and stores each board's (title, score) pairs in the
``fengyun`` table and in the workbook ``风云.xls``.
"""

import re
import ssl
import urllib.request
from copy import copy  # NOTE: shadowed by xlutils.copy.copy below; kept so no import is dropped

import requests
import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlutils.copy import copy  # the ``copy`` actually used: xlrd book -> writable xlwt book

import DBUtils

# Module-level side effect carried over unchanged from the original script:
# urllib's default HTTPS context is replaced by one that skips verification.
ssl._create_default_https_context = ssl._create_unverified_context

_BASE_URL = "http://top.baidu.com"
_BOARDS_URL = "http://top.baidu.com/boards?fr=topindex"
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
    "Connection": "keep-alive",
}
# The response encoding is forced to this value to avoid mojibake.
_PAGE_ENCODING = "GB2312"
# requests never times out by default; bound every request so a stalled
# server cannot hang the script forever.
_REQUEST_TIMEOUT = 10
_XLS_PATH = "风云.xls"
# Each board gets its own band of this many rows in the worksheet.
_ROWS_PER_BOARD = 50
_HREF_RE = re.compile(r'href="(.+?)"')
_INSERT_SQL = "INSERT INTO fengyun (name, cat, inde) VALUES (%s, %s, %s)"


def _fetch_soup(url):
    """Download *url* with the shared headers and parse it with ``html.parser``."""
    response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    response.encoding = _PAGE_ENCODING
    return BeautifulSoup(response.text, "html.parser")


def _plain(value):
    """Return *value* as a plain ``str`` (``None`` stays ``None``) for use as a SQL parameter."""
    return None if value is None else str(value)


def getContent():
    """Fetch and parse the board index page.

    Returns:
        A ``BeautifulSoup`` document for the board index URL.
    """
    return _fetch_soup(_BOARDS_URL)


def yin(li):
    """Fetch and parse a single board page.

    Args:
        li: Link text appended verbatim to ``http://top.baidu.com`` (the
            original file carried the example ``./buzz?b=338``).

    Returns:
        A ``BeautifulSoup`` document for that page.
    """
    return _fetch_soup("%s%s" % (_BASE_URL, li))


def list():
    """Return the board links found in the index page's ``div.links`` element.

    NOTE: this function keeps its original name and therefore shadows the
    ``list`` builtin inside this module.

    Returns:
        Every ``href`` value inside the div except the last one, which the
        original code also discarded (presumably it is not a board link --
        verify against the live page). An empty list is returned when the
        div is missing; the original raised ``IndexError`` in that case.
    """
    links_div = getContent().find("div", attrs={"class": "links"})
    if links_div is None:
        return []
    hrefs = _HREF_RE.findall(str(links_div))
    return hrefs[:-1]


def lei():
    """Return the category names shown in the index page's ``div.links`` element.

    Returns:
        The ``.string`` of every ``<a>`` in the div except the last one
        (mirroring ``list()``). An empty list is returned when the div is
        missing; the original raised ``AttributeError`` in that case.
    """
    links_div = getContent().find("div", attrs={"class": "links"})
    if links_div is None:
        return []
    names = [anchor.string for anchor in links_div.find_all("a")]
    return names[:-1]


def _scrape_board(li):
    """Return the ``(title, score)`` pairs listed on the board page for *li*."""
    page = yin(li)
    titles = [a.string for a in page.find_all("a", attrs={"class": "list-title"})]
    scores = []
    for cell in page.find_all("td", attrs={"class": "last"}):
        span = cell.find("span")
        # A cell without <span> used to raise AttributeError; record "no score".
        scores.append(span.string if span is not None else None)
    # The original indexed scores by title position and crashed when there
    # were fewer scores than titles; pad so every title is kept.
    scores.extend([None] * (len(titles) - len(scores)))
    # ``tuple`` rather than ``list``: the builtin ``list`` is shadowed above.
    return tuple(zip(titles, scores))


def _store_rows(rows, bie):
    """Insert every ``(title, score)`` pair into ``fengyun`` under category *bie*."""
    conn = DBUtils.getConnect()
    cursor = None
    try:
        cursor = conn.cursor()
        for title, score in rows:
            # Bound parameters instead of string interpolation: scraped
            # titles may contain quotes, which broke (or could subvert) the
            # hand-built statement. Committing per row matches the previous
            # DBUtils.insertData behaviour.
            cursor.execute(_INSERT_SQL, (_plain(title), _plain(bie), _plain(score)))
            conn.commit()
    finally:
        # Always release the cursor/connection, even when an insert fails.
        DBUtils.closeConnect(cursor, conn)


def _append_rows_to_xls(rows, x, bie):
    """Write *rows* into board *x*'s band of rows in the existing workbook."""
    # xlrd workbooks are read-only; copy into a writable xlwt workbook.
    workbook = copy(xlrd.open_workbook(_XLS_PATH))
    sheet = workbook.get_sheet(0)
    for i, (title, score) in enumerate(rows):
        row = i + 1 + x * _ROWS_PER_BOARD  # +1 skips the header row
        # NOTE(review): the original wrote ``i + 1 * x`` here, which is
        # ``i + x`` by operator precedence. The value is preserved as-is;
        # confirm whether ``i + 1`` or the global row number was intended.
        sheet.write(row, 0, i + x)
        sheet.write(row, 1, title)
        sheet.write(row, 2, bie)
        sheet.write(row, 3, score)
    # Saving under the same name overwrites the file that was just read.
    workbook.save(_XLS_PATH)


def inp(li, x, bie):
    """Scrape one board and persist its entries to MySQL and the workbook.

    Args:
        li: Board link as returned by ``list()``; passed to ``yin()``.
        x: Zero-based board index; selects the 50-row band in the worksheet.
        bie: Category name stored alongside every entry.
    """
    rows = _scrape_board(li)
    _store_rows(rows, bie)
    _append_rows_to_xls(rows, x, bie)


def saveExcel():
    """Create ``风云.xls`` with a single sheet that holds only the header row."""
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("百度风云")
    header = ["顺序", "电影名", "类别", "评分"]
    for column, title in enumerate(header):
        sheet.write(0, column, title)
    workbook.save(_XLS_PATH)


def main():
    """Scrape every board and report completion."""
    links = list()
    # Stored under a separate name: the original rebound ``lei`` to its own
    # result, which made the function unusable afterwards.
    categories = lei()
    saveExcel()
    for x, (link, category) in enumerate(zip(links, categories)):
        inp(link, x, category)
    print("完成")


# Runs on load, exactly as the original flat script did. Wrap this call in an
# ``if __name__ == "__main__":`` guard if the file is ever imported as a module.
main()
import pymysql.cursors


def getConnect():
    """Open and return a new PyMySQL connection.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    # NOTE(review): connection settings (including the password) are
    # hard-coded here; consider loading them from configuration instead.
    return pymysql.connect(
        host="",
        user="root",
        password="123",
        database="pymysql",
        charset="utf8",
    )


def closeConnect(cursor, conn):
    """Close *cursor* and then *conn*; falsy arguments are skipped.

    The connection is closed even if closing the cursor raises, so a failing
    cursor can no longer leak the connection.
    """
    try:
        if cursor:
            cursor.close()
    finally:
        if conn:
            conn.close()


def insertData(sql, cursor, conn, params=None):
    """Execute *sql* on *cursor* and commit on *conn*.

    Args:
        sql: The statement to run. Use ``%s`` placeholders together with
            *params* rather than formatting values into the string.
        cursor: Cursor obtained from *conn*.
        conn: Connection on which the statement is committed.
        params: Optional values bound to the placeholders in *sql*. The
            default ``None`` executes *sql* as-is, matching the original
            three-argument behaviour.
    """
    cursor.execute(sql, params)
    conn.commit()
# Original article: https://www.cnblogs.com/dpdd/p/14851824.html