python

关注公众号 jb51net

关闭
首页 > 脚本专栏 > python > BeautifulSoup 网页

python 中的 BeautifulSoup 网页使用方法解析

作者:autofelix

这篇文章主要介绍了python 中的 BeautifulSoup 网页使用方法解析,文章基于python的相关资料展开详细内容介绍,具有一定的参考价值需要的小伙伴可以参考一下

一、安装

pip install bs4
pip install lxml
pip install html5lib

二、html.parser解析

html = '''
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal">&times;</button>
<h4 class="modal-title">Modal title</h4>
</div>
<div class="modal-body">
...
</div>
<div class="modal-footer">
<a href="#" rel="external nofollow"  rel="external nofollow"  class="btn btn-default" data-dismiss="modal">Close</a>
<a href="#" rel="external nofollow"  rel="external nofollow"  class="btn btn-primary">Save</a>
</div>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
#prettify()用于格式化输出html/xml文档
print(soup.prettify())

三、外部文档解析

from bs4 import BeautifulSoup
fp = open('html_doc.html', encoding='utf8')
soup = BeautifulSoup(fp, 'lxml')

四、标签选择器

from bs4 import BeautifulSoup
soup = BeautifulSoup('<p class="name nickname user"><b>i am autofelix</b></p>', 'html.parser')
#获取整个p标签的html代码
print(soup.p)
#获取b标签
print(soup.p.b)
#获取p标签内容,使用NavigableString类中的string、text、get_text()
print(soup.p.text)
#返回一个字典,里面是多有属性和值
print(soup.p.attrs)
#查看返回的数据类型
print(type(soup.p))
#根据属性,获取标签的属性值,返回值为列表
print(soup.p['class'])
#给class属性赋值,此时属性值由列表转换为字符串
soup.p['class']=['Web','Site']
print(soup.p)

五、css选择器

html = """
<html>
<head>
<title>零基础学编程</title>
</head>
<body>
<p class="intro"><b>i am autofelix</b></p>
<p class="nickname">飞兔小哥</p>
<a href="https://autofelix.blog.csdn.net" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="csdn">csdn主页</a>
<a href="https://xie.infoq.cn/u/autofelix/publish" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="infoq">infoq主页</a>
<a href="https://blog.51cto.com/autofelix" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="51cto">51cto主页</a>
<p class="attention">跪求关注 一键三连</p>
<p class="introduce">
<a href="https://www.cnblogs.com/autofelix" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="cnblogs">博客园主页</a>
</p>
</body>
</html>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
#根据元素标签查找
print(soup.select('nickname'))
#根据属性选择器查找
print(soup.select('a[href]'))
#根据类查找
print(soup.select('.attention'))
#后代节点查找
print(soup.select('html head title'))
#查找兄弟节点
print(soup.select('p + a'))
#根据id选择p标签的兄弟节点
print(soup.select('p ~ #csdn'))
#nth-of-type(n)选择器,用于匹配同类型中的第n个同级兄弟元素
print(soup.select('p ~ a:nth-of-type(1)'))
#查找子节点
print(soup.select('p > a'))
print(soup.select('.introduce > #cnblogs'))

六、节点遍历

html = """
<html>
<head>
<title>零基础学编程</title>
</head>
<body>
<p class="intro"><b>i am autofelix</b></p>
<p class="nickname">飞兔小哥</p>
<a href="https://autofelix.blog.csdn.net" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="csdn">csdn主页</a>
<a href="https://xie.infoq.cn/u/autofelix/publish" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="infoq">infoq主页</a>
<a href="https://blog.51cto.com/autofelix" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="51cto">51cto主页</a>
<p class="attention">跪求关注 一键三连</p>
<p class="introduce">
<a href="https://www.cnblogs.com/autofelix" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="cnblogs">博客园主页</a>
</p>
</body>
</html>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
body_tag=soup.body
print(body_tag)
# 以列表的形式输出,所有子节点
print(body_tag.contents)
# children 用来遍历子节点
for child in body_tag.children:
print(child)

七、find_all方法

html = """
<html>
<head>
<title>零基础学编程</title>
</head>
<body>
<p class="intro"><b>i am autofelix</b></p>
<p class="nickname">飞兔小哥</p>
<a href="https://autofelix.blog.csdn.net" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="csdn">csdn主页</a>
<a href="https://xie.infoq.cn/u/autofelix/publish" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="infoq">infoq主页</a>
<a href="https://blog.51cto.com/autofelix" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="51cto">51cto主页</a>
<p class="attention">跪求关注 一键三连</p>
<p class="introduce">
<a href="https://www.cnblogs.com/autofelix" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="cnblogs">博客园主页</a>
</p>
</body>
</html>
"""
import re
from bs4 import BeautifulSoup
# 创建soup解析对象
soup = BeautifulSoup(html, 'html.parser')
# 查找所有a标签并返回
print(soup.find_all("a"))
# 查找前两条a标签并返回,只返回两条a标签
print(soup.find_all("a",limit=2))
# 按照标签属性以及属性值查找
print(soup.find_all("p",class_="nickname"))
print(soup.find_all(id="infoq"))
# 列表行书查找tag标签
print(soup.find_all(['b','a']))
# 正则表达式匹配id属性值
print(soup.find_all('a',id=re.compile(r'.\d')))
print(soup.find_all(id=True))
# True可以匹配任何值,下面代码会查找所有tag,并返回相应的tag名称
for tag in soup.find_all(True):
print(tag.name,end=" ")
# 输出所有以b开始的tag标签
for tag in soup.find_all(re.compile("^b")):
print(tag.name)
# 简化前写法
soup.find_all("a")
# 简化后写法
soup("a")

八、find方法

html = """
<html>
<head>
  <title>零基础学编程</title>
</head>
<body>
  <p class="intro"><b>i am autofelix</b></p>
  <p class="nickname">飞兔小哥</p>
  <a href="https://autofelix.blog.csdn.net" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="csdn">csdn主页</a>
  <a href="https://xie.infoq.cn/u/autofelix/publish" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="infoq">infoq主页</a>
  <a href="https://blog.51cto.com/autofelix" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="51cto">51cto主页</a>
  <p class="attention">跪求关注 一键三连</p>
  <p class="introduce">
    <a href="https://www.cnblogs.com/autofelix" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  id="cnblogs">博客园主页</a>
  </p>
</body>
</html>
"""
import re
from bs4 import BeautifulSoup
# 创建soup解析对象
soup = BeautifulSoup(html, 'html.parser')
# 查找第一个a并直接返回结果
print(soup.find('a'))
# 查找title
print(soup.find('intro'))
# 匹配指定href属性的a标签
print(soup.find('a',href='https://autofelix.blog.csdn.net'))
# 根据属性值正则匹配
print(soup.find(class_=re.compile('tro')))
# attrs参数值
print(soup.find(attrs={'class': 'introduce'}))
# 使用 find 时,如果没有找到查询标签会返回 None,而 find_all 方法返回空列表
print(soup.find('aa'))
print(soup.find_all('bb'))
# 简化写法
print(soup.head.title)
# 上面代码等价于
print(soup.find("head").find("title"))

到此这篇关于python 中的 BeautifulSoup 网页解析的文章就介绍到这了,更多相关BeautifulSoup 网页内容请搜索脚本之家以前的文章或继续浏览下面的相关文章希望大家以后多多支持脚本之家!

您可能感兴趣的文章:
阅读全文