import requests

s = requests.Session()

# Login form: open the browser developer tools, log in on the site, then
# find the POST URL and its form fields in the network panel.
form = {
    'username': '',  # fill in your username
    'password': '',  # fill in your password
}
s.post(url, form)  # `url` is a placeholder: the login POST endpoint

from bs4 import BeautifulSoup

r = s.get(url)  # `url` is a placeholder: the page to scrape
r.encoding = 'gbk'  # page encoding (this site serves GBK)
soup = BeautifulSoup(r.text, 'lxml')

# find example: text of the first <h1>
title = soup.find('h1').text

# find_all example: every <dl class="attachlist"> attachment entry
dls = soup.find_all('dl', class_='attachlist')
for dl in dls:
    filename = dl.dt.a.text
    # `baseUrl` is a placeholder: the site root used to absolutize hrefs
    fileUrl = baseUrl + dl.dt.a.get('href')
def download(url, s, filename):
    """Download `url` through session `s` into ./`filename`.

    Streams the response in fixed-size chunks while printing a progress
    percentage.

    Returns:
        True on success; False if the file already exists locally or the
        request timed out / failed to connect.
    """
    import os

    # Check before issuing the request so an existing file costs nothing.
    if os.path.isfile('./' + filename):
        print(' File already exist, skipped')
        return False
    try:
        r = s.get(url, stream=True, timeout=2)
        chunk_size = 1000
        received = 0
        # NOTE(review): assumes the server always sends Content-Length;
        # a chunked response would raise KeyError here.
        length = int(r.headers['Content-Length'])
        print('Downloading {}'.format(filename))
        with open('./' + filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size):
                # Count the bytes actually received, not chunk_size, so the
                # percentage is exact even for the final, shorter chunk.
                received += len(chunk)
                percent = received / length * 100
                print('\r {:.4f}'.format(percent), end='')
                f.write(chunk)
        print('\r Finished ')
        return True
    except requests.exceptions.ReadTimeout:
        print('Read time out, this file failed to download')
        return False
    except requests.exceptions.ConnectionError:
        print('ConnectionError, this file failed to download')
        return False
// XPath query: every <a> inside a <span> whose id contains "thread"
var threads = document.evaluate("//span[contains(@id, 'thread')]/a", document, null,
    XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null)
// User code: log each thread's title and absolute URL
for (var i = 0; i < threads.snapshotLength; ++i) {
    thread = threads.snapshotItem(i)
    // `baseUrl` is a placeholder: the site root used to absolutize hrefs
    threadUrl = baseUrl + thread.attributes.href.textContent
    threadTitle = thread.text
    console.log('%s (%s)', threadTitle, threadUrl)
}
var parser = new DOMParser()
var xhr = new XMLHttpRequest()
xhr.overrideMimeType("text/html;charset=gbk") // page encoding (this site serves GBK)
xhr.onload = function(e) {
    if (xhr.readyState === 4) {
        if (xhr.status === 200) {
            // Parse the raw response text into a Document
            var threadDoc = parser.parseFromString(xhr.response, 'text/html')
            // XPath query against the fetched sub-page: attachment links
            var files = document.evaluate("//dl[@class='t_attachlist']/dt/a", threadDoc, null,
                XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null)
            // User code goes here
        } else {
            console.error(xhr.statusText)
        }
    }
}
xhr.onerror = function(e) {
    console.error(xhr.statusText)
}
xhr.open('GET', threadUrl, false) // GET the sub-page; false = synchronous request
xhr.send(null)
// Trigger a browser download of `url`, saved as `filename`, by
// synthesizing a click on a temporary <a download> element.
// NOTE(review): document.createEvent/initEvent are deprecated in favor of
// `new MouseEvent('click')`, but are kept here as in the original.
function download(url, filename) {
    var a = document.createElement('a')
    var e = document.createEvent('MouseEvents')
    e.initEvent('click', false, false)
    a.download = filename
    a.href = url
    a.dispatchEvent(e)
}
安装油猴插件, 并将代码保存为用户脚本, 注意添加例如
// @include *www.baidu.com/*
确保脚本正常运行后, 打开开发者工具 - Console, 之后打开待爬网页即可.
原文:https://www.cnblogs.com/maoruimas/p/13254362.html