码迷,mamicode.com
首页 > 编程语言 > 详细

python BeautifulSoup4解析网页

时间:2019-05-26 13:20:15      阅读:172      评论:0      收藏:0      [点我收藏+]

标签:属性   arc   soup   att   time   turn   ref   false   orm   

# Beautiful Soup 4 parser class, aliased to the short name used by this script.
from bs4 import BeautifulSoup as BS

# Sample document used by every example that follows. Note that the first <a>
# element contains only an HTML comment, and that there is no whitespace
# between the first and second <a> elements.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p></body></html>
"""

# The parser backend is selected by its *name* (a string); "html.parser" is the
# backend built on Python's standard library, so no extra package is required.
soup = BS(html, "html.parser")

# Compare .text and .string on every <a> element of the document.
for link in soup.find_all("a"):
    # .text is a plain str; the content of an HTML comment is not included.
    print("i.text:", link.text)
    # .string returns the single child string object (a NavigableString, or
    # its Comment subclass), so commented-out content is printed as well.
    print("i.string:", link.string)


# --- Child nodes: .contents (list) versus .children (iterator) ---
print("soup.head.contents:", soup.head.contents, type(soup.head.contents))
print("soup.head.children:", soup.head.children, type(soup.head.children))

print("soup.body.contents:", soup.body.contents)  # a list of the direct children
print("soup.body.children:", soup.body.children)  # an iterator over the direct children

for child in soup.body.children:
    print(child)

# --- Descendants: children, grandchildren, ... visited recursively ---
print("子孙节点 都显示出来")
for node in soup.body.descendants:
    print(node)

# --- Strings ---
# .string is None here because <body> has more than one child.
print("soup.body.string:", soup.body.string)
# .strings and .stripped_strings are generators; printing them shows the
# generator object, not the text (the loop below shows the actual values).
print("soup.body.strings:", soup.body.strings)
print("soup.body.stripped_strings:", soup.body.stripped_strings)  # whitespace stripped

print("去掉空格的body子元素:")
for text in soup.body.stripped_strings:
    print(text)


# --- Navigating from the first <a> element ---
print("soup.a.parent:", soup.a.parent)
# Note: text nodes, including bare "\n" newlines, can be the previous or next
# sibling of the current node.
print("soup.a.next_sibling:", soup.a.next_sibling)
print("soup.a.previous_sibling:", soup.a.previous_sibling)
# .next_element / .previous_element follow parse order, so the result is not
# necessarily on the same level as the current node.
print("soup.a.next_element:", soup.a.next_element)
print("soup.a.previous_element:", soup.a.previous_element)

print("打印所有后面的同级节点:\n")
for sibling in soup.a.next_siblings:
    print(sibling)

# Index 1 skips the first following element. The label names the expression
# that is actually evaluated, which is not the same value as
# soup.a.next_element printed above.
print("list(soup.a.next_elements)[1]:", list(soup.a.next_elements)[1])


import re

print("***********find_all*****")

# A plain string matches tags by their exact name.
print(soup.find_all("a"))

print("引入正则表达式:")

# A compiled pattern passed positionally is matched against tag *names*.
print(soup.find_all(re.compile(r"^title")))

print("列表的方式匹配:")
# A list matches a tag whose name equals any of the listed names.
print(soup.find_all(["a", "b"]))

print("函数的方式匹配,类似filter")
def func(tag):
    """Filter for ``find_all``: keep tags that have a ``class`` attribute and
    whose name starts with the letter "a".

    Args:
        tag: An object exposing ``has_attr(name)`` and a string ``name``
            attribute (a Beautiful Soup ``Tag`` when used with ``find_all``).

    Returns:
        ``tag`` itself when both conditions hold, otherwise ``None``.
    """
    # The attribute name must be the string "class"; a bare `class` is a
    # Python keyword and cannot be used as an expression.
    if tag.has_attr("class") and re.search(r"^a", tag.name):
        return tag
    return None

# A callable passed to find_all is invoked for each tag; tags for which it
# returns a truthy value are collected.
print(soup.find_all(func))


# Same sample document as before, re-declared so the searches that follow
# start from a freshly parsed tree.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p></body></html>
"""

# NOTE(review): BS is expected to be bs4's BeautifulSoup class - confirm it is
# imported before this point. The parser backend is given by name as a string.
soup = BS(html, "html.parser")

# --- Searching by attribute value ---
print("按属性值查找:")
print(soup.find_all(id="link1"))
print(soup.find_all("a", id="link1"))

# Several keyword filters are combined; find_all always returns a list.
print(soup.find_all(id="link2", href=re.compile(r"laci")))
# `class` is a Python keyword, hence the trailing underscore in `class_`.
print(soup.find_all(class_="story"))
# The attrs dict is the alternative that needs no special keyword spelling.
print(soup.find_all(attrs={"class": "sister"}))

# --- Searching by element content (the `text` argument) ---
# Beautiful Soup 4.4+ also accepts this argument under the name `string`.
print("按元素内容查找text参数:")
print(soup.find_all(text="Tillie"))
print(soup.find_all(text=["Tillie", "Lacie"]))  # results are the matching strings, not tags
print(soup.find_all(text=re.compile(r"ormous")))

# From a matched string, .parent walks up to its enclosing elements.
print("通过内容元素 找到上级元素")
print(soup.find_all(text=re.compile(r"ormous"))[1].parent.parent)

# Limit the number of results returned.
print("limit:")
print(soup.find_all("a", limit=2))

# recursive=False restricts the search to the direct children of <body>.
print("只在子节点查找:")
print(soup.body.find_all("a", limit=2, recursive=False))
print(soup.body.find_all(class_="story", recursive=False))

 

python BeautifulSoup4解析网页

标签:属性   arc   soup   att   time   turn   ref   false   orm   

原文地址:https://www.cnblogs.com/xiaoxiao075/p/10925489.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!