首页 > 其他 > 详细

BeautifulSoup4 库的基本使用

时间:2018-09-12 10:07:55      阅读:186      评论:0      收藏:0      [点我收藏+]

  喜欢我的博客可以加关注,有问题可以提问我。

  1.基本使用(后面各示例都复用这里的 html;由于内容过长,不再重复粘贴)

# Sample document for the basic-usage example. It only contains <head>-level
# tags (title / meta / link).
html = """
<html>
<head><title>dsojfeoifjosieofiej</title></head>
    
    <meta http-equiv="content-type" content="text/html;charset=utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge">
    <meta content="always" name="referrer">
    <meta name="theme-color" content="#2932e1">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" />
    <link rel="search" type="application/opensearchdescription+xml" href="/content-search.xml" title="百度搜索" />
    <link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu_85beaf5496f291521eb75ba38eacbd87.svg">
    <link rel="dns-prefetch" href="//s1.bdstatic.com"/>
    <link rel="dns-prefetch" href="//t11.baidu.com"/>
    <link rel="dns-prefetch" href="//t12.baidu.com"/>
    <link rel="dns-prefetch" href="//b1.bdstatic.com"/>
"""
from bs4 import BeautifulSoup

# The parser is selected by name, so it must be the string "lxml"; a bare
# `lxml` is an undefined identifier and raises NameError.
soup = BeautifulSoup(html, "lxml")
print(soup.prettify())    # the parsed tree, pretty-printed
print(soup.title.string)  # text inside the <title> tag

  2.选择元素

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
print(soup.title)
print(soup.head)
# Attribute-style access only yields the first matching tag.
# NOTE(review): the sample `html` shown in this file has no <p> tag —
# presumably the article ran this against a longer document.
print(soup.p)

  3.获取名称

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# `.name` is the tag's own name.
print(soup.p.name)

  4.获取属性

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# Two equivalent ways to read the tag's `name` attribute; the key has to be
# a string, not a bare identifier.
print(soup.p.attrs["name"])
print(soup.p["name"])

  5.获取内容

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# `.string` is the text content of the first <p> tag.
print(soup.p.string)

  6.嵌套选择

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# Tag lookups can be chained: <head> -> <title> -> its text.
print(soup.head.title.string)

  7.子节点和子孙节点

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# `.contents` is the list of the tag's direct children.
print(soup.p.contents)

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# `.children` is an iterator over the direct children, so printing it shows
# the iterator object; loop over it to see the nodes themselves.
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    print(i, child)

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# The attribute is spelled `descendants`. An unknown attribute name is
# treated as a child-tag lookup and yields None, so the misspelled form
# failed inside enumerate().
print(soup.p.descendants)
for i, node in enumerate(soup.p.descendants):
    print(i, node)  # every node below <p>, at any depth

  8.父节点和祖先节点

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# `.parent` is the direct parent of the first <a> tag.
# NOTE(review): the sample `html` shown in this file has no <a> tag —
# presumably the article ran this against a longer document.
print(soup.a.parent)

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# `.parents` iterates over all ancestors; list() materialises the
# (index, ancestor) pairs so they can be printed.
print(list(enumerate(soup.a.parents)))

  9.兄弟节点

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# Siblings that follow / precede the first <a> tag, as (index, node) pairs.
print(list(enumerate(soup.a.next_siblings)))
print(list(enumerate(soup.a.previous_siblings)))

  10.标准选择器

from bs4 import BeautifulSoup

# Parser name and tag names must be string literals.
soup = BeautifulSoup(html, "lxml")
# find_all() can be called again on each result to search inside it.
# NOTE(review): the sample `html` shown in this file has no <ul>/<li> tags —
# presumably the article ran this against a longer document.
for ul in soup.find_all("ul"):
    print(ul.find_all("li"))

  10.1加参数

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# Attribute filters are a dict of string keys and string values. Unquoted,
# `list-1` parses as the subtraction `list - 1`.
print(soup.find_all(attrs={"id": "list-1"}))
print(soup.find_all(attrs={"name": "elements"}))


from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# Keyword form of the attribute filters. `class_` carries a trailing
# underscore because `class` is a reserved word in Python.
print(soup.find_all(id="list-1"))
print(soup.find_all(class_="elements"))

  10.2text

from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# Matching on text returns the matching strings rather than tags.
# NOTE(review): newer Beautiful Soup releases call this argument `string`;
# `text` is kept here as the article wrote it.
print(soup.find_all(text="Foo"))

  10.3 find(只返回单个元素,即第一个匹配的元素)

from bs4 import BeautifulSoup

# Parser name and tag names must be string literals.
soup = BeautifulSoup(html, "lxml")
# find() returns only the first match, or None when nothing matches.
first_ul = soup.find("ul")
print(first_ul)
print(type(first_ul))
print(soup.find("page"))

  10.4 find_parents() find_parent()(这里和上面的类似就不粘贴代码了)

  10.5 find_next_siblings() find_next_sibling()(这里和上面的类似就不粘贴代码了)

  11. CSS 选择器


from bs4 import BeautifulSoup

# Parser name must be a string literal.
soup = BeautifulSoup(html, "lxml")
# CSS selectors are passed as strings. Unquoted they do not parse, and the
# `#` in an id selector would turn the rest of the line into a comment.
# Elements with class "panel-heading" inside elements with class "panel".
print(soup.select(".panel .panel-heading"))
# <li> tags inside <ul> tags.
print(soup.select("ul li"))
# Elements with class "element" inside the element whose id is "list-2".
print(soup.select("#list-2 .element"))
# select() returns a list; inspect the type of its first item.
print(type(soup.select("ul")[0]))

from bs4 import BeautifulSoup

# Parser name and selectors must be string literals.
soup = BeautifulSoup(html, "lxml")
# select() can be called again on each result to search inside it.
# NOTE(review): the inner selector is "ul" as written in the article; the
# find_all example in the same article looks up "li" inside each <ul>, so
# "li" may have been intended — confirm.
for ul in soup.select("ul"):
    print(ul.select("ul"))

  11.1 获取属性

from bs4 import BeautifulSoup

# Parser name, selector and attribute key must be string literals.
soup = BeautifulSoup(html, "lxml")
for ul in soup.select("ul"):
    # Two equivalent ways to read the `id` attribute of each <ul>.
    print(ul["id"])
    print(ul.attrs["id"])

  11.2 获取内容

from bs4 import BeautifulSoup

# Parser name and selector must be string literals.
soup = BeautifulSoup(html, "lxml")
for li in soup.select("li"):
    # get_text() returns the text content of the tag.
    print(li.get_text())

BeautifulSoup4 库的基本使用

原文:https://www.cnblogs.com/zll20153246/p/9632756.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!