"""Demonstrate common lxml XPath queries on a small embedded HTML fragment.

The fragment is a single ``<li>`` entry that looks like one item of the
bangumi.tv anime ranking list. Each query below prints its result so the
effect of the XPath expression can be inspected.
"""
from urllib import request

from lxml import etree

# To query the live page instead of the embedded fragment, fetch it first:
# url = 'http://bangumi.tv/anime/browser?sort=rank'
# response = request.urlopen(url)
# html_text = response.read()

# NOTE(review): the opening "<li" was missing from the pasted fragment (only
# its attributes and the closing </li> were left); it is restored here so the
# attributes are parsed as attributes rather than as stray text.
html_text = '''
<li id="item_1728" class="item even clearit">
    <a href="/subject/1728" class="subjectCover cover ll">
        <span class="image">
            <img src="//lain.bgm.tv/pic/cover/s/71/37/1728_HLsCr.jpg" class="cover">
        </span>
        <span class="overlay"></span>
    </a>
    <div class="inner">
        <h3>
            <a href="/subject/1728" class="l">浪客剑心 追忆篇</a>
            <small class="grey">るろうに剣心 -明治剣客浪漫譚- 追憶編</small>
        </h3>
        <span class="rank"><small>Rank </small>12</span>
        <p class="info tip"> 4话 / 1999年2月20日 </p>
        <p class="rateInfo">
            <span class="sstars9 starsinfo"></span>
            <small class="fade">8.8</small>
            <span class="tip_j">(2165人评分)</span>
        </p>
    </div>
</li>
'''

# etree.HTML parses the fragment with the lenient HTML parser and returns the
# root element; tostring() serialises the resulting tree back to bytes.
tree = etree.HTML(html_text)
print(etree.tostring(tree))

# Every <a> element in the document,
# e.g. [<Element a at 0x2ebe198>, <Element a at 0x2ebe170>]
print(tree.xpath('//a'))

# The href attribute of every <a>:
# tree.xpath('//a/@href') -> ['/subject/1728', '/subject/1728']

# The class attribute of every <a> -> ['subjectCover cover ll', 'l']
print(tree.xpath('//a/@class'))

# Every <a> whose href attribute equals "/subject/1728".
print(tree.xpath('//a[@href="/subject/1728"]'))

# <a> elements that are direct children of a <div>. The only <a> inside the
# <div> here is nested in <h3>, so no direct child is expected to match.
print(tree.xpath('//div/a'))

# <a> elements anywhere below a <div> (children, grandchildren, ...).
print(tree.xpath('//div//a'))

# class attributes of those <a> elements and of anything nested inside them.
print(tree.xpath('//div//a//@class'))

# <span> children of the last <p> among its siblings below a <div>.
print(tree.xpath('//div//p[last()]/span'))

# The second-to-last <p> itself; print its leading text.
second_last_p = tree.xpath('//div//p[last()-1]')
print(second_last_p[0].text)

# String value of the root: all text content concatenated, markup dropped.
print(tree.xpath('string()'))

# Every text node as a separate list entry.
text_nodes = tree.xpath('//text()')
print(text_nodes)

# lxml returns "smart" strings that remember where they came from.
print(text_nodes[0].getparent().tag)  # tag of the element owning the text
print(text_nodes[1].is_text)  # True when the string is an element's .text
print(text_nodes[1].is_tail)  # True when the string is an element's .tail