首页 > 其他好文 > 详细

BeautifulSoup 库

时间:2018-06-24 10:29:51      阅读:145      评论:0      收藏:0      [点我收藏+]

标签:Once   nbsp   auth   import   amp   ref   none   rom   get   

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Alex  Li
# Basic BeautifulSoup usage: parse a document and inspect its <title> tag.
# NOTE(review): re, urllib.request and requests are not used by this snippet;
# they are kept in case code outside this excerpt relies on them.
import re
import urllib.request

import requests
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())  # the document re-indented as markup
print(soup.title.string)  # the text inside the <title> tag
print(type(soup.title))  # <class 'bs4.element.Tag'>; Tag objects can be chained
print(soup.title)  # the whole tag: <title>The Dormouse's story</title>
print(soup.li)  # the document has no <li>, so this prints None
# Read the name of a tag through Tag.name.
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
print(soup.title.name)  # the tag's own name: title

# Read tag attributes, either through the attrs dict or by indexing the tag.
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
# Attribute names must be string keys; the bare identifiers `name` / `href`
# would raise NameError.
print(soup.p.attrs["name"])  # via the attrs dict of the first <p>: dromouse
print(soup.a["href"])  # indexing the first <a> directly: http://example.com/elsie
# Read the text content of a tag through Tag.string.
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
print(soup.p.string)  # text of the first <p>: The Dormouse's story

# Chain tag attribute access to walk down the tree (soup.head.title ...).
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
print(type(soup.head))  # <class 'bs4.element.Tag'>, so further access can be chained
# `.string`, not `.strint`: an unknown attribute is treated as a child-tag
# lookup and would silently yield None.
print(soup.head.title.string)  # nested access: head -> title -> its text

# List the direct children of a tag with Tag.contents.
from bs4 import BeautifulSoup

html = """
        <title>The Dormouse's story</title>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        <p class="story">...</p>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
print(soup.p.contents)  # the children of the first <p>, as a list

# Iterate over the direct children of a tag with Tag.children.
from bs4 import BeautifulSoup

html = """
        <title>The Dormouse's story</title>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        <p class="story">...</p>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
# i is the running index; child is each child node of the first <p>.
for i, child in enumerate(soup.p.children):
    print(i, child)

# Navigate upwards from a tag with Tag.parent.
from bs4 import BeautifulSoup

html = """
        <title>The Dormouse's story</title>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        <p class="story">...</p>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
print(soup.a.parent)  # the parent of the first <a>, printed with the <a> itself inside


# Build a soup from the story fragment.
# NOTE(review): the statements that used `soup` in this snippet are missing
# from this excerpt; only the setup is kept here.
from bs4 import BeautifulSoup

html = """
        <title>The Dormouse's story</title>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        <p class="story">...</p>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")

# Build a soup from the panel/list fragment.
# NOTE(review): the statements that used `soup` in this snippet are missing
# from this excerpt; only the setup is kept here.
from bs4 import BeautifulSoup

# The markup has to live in a string literal; as bare text it is a syntax error.
html = """
<div class="panel">
    <div class="panel-heading">
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")

# Search by attribute values with find_all(attrs=...).
from bs4 import BeautifulSoup

# The markup has to live in a string literal; as bare text it is a syntax error.
html = """
<div class="panel">
    <div class="panel-heading">
    <div class="panel-body">
        <ul class="list" id="list-1" name="elements">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
# Keys and values must be strings: unquoted, `id` is the builtin function and
# `list-1` is a subtraction expression.
print(soup.find_all(attrs={"id": "list-1"}))  # every tag whose id is "list-1"
print(soup.find_all(attrs={"name": "elements"}))  # every tag whose name attribute is "elements"

# Iterate over the tags matched by a CSS selector.
from bs4 import BeautifulSoup

# The markup has to live in a string literal; as bare text it is a syntax error.
html = """
<div class="panel">
    <div class="panel-heading">
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
for ul in soup.select("ul"):
    # NOTE(review): the original loop body is missing from this excerpt;
    # listing each <ul>'s <li> descendants is an assumed stand-in - confirm.
    print(ul.select("li"))

# Query the tree with CSS selectors through select().
from bs4 import BeautifulSoup

# The markup has to live in a string literal; as bare text it is a syntax error.
html = """
<div class="panel">
    <div class="panel-heading">
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
# Selectors are passed as strings; unquoted, the leading "#" of the last one
# would start a comment and swallow the rest of the line.
print(soup.select(".panel .panel-heading"))  # class="panel-heading" tags under class="panel", with their content
print(soup.select("ul li"))  # <li> tags under a <ul>, returned as a list
print(soup.select("#list-2 .element"))  # class="element" tags under the tag with id="list-2"

# Print the text of every <li> matched by a CSS selector.
from bs4 import BeautifulSoup

# The markup has to live in a string literal; as bare text it is a syntax error.
html = """
<div class="panel">
    <div class="panel-heading">
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
"""

# Explicit parser: keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")
for li in soup.select("li"):
    # NOTE(review): the loop body is missing from this excerpt; get_text() is
    # assumed because it produces the output listed below - confirm.
    print(li.get_text())
# Expected output:
# Foo
# Bar
# Jay
# Foo
# Bar


BeautifulSoup 库

标签:Once   nbsp   auth   import   amp   ref   none   rom   get   


评论 一句话评论(0
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com