标签:节点 result htm cond des als contains sel path
参考:https://cuiqingcai.com/5545.html
XPath
XPath常用规则
from lxml import etree

# Sample document used by the examples that follow.  The last <li> has no
# closing tag; etree.HTML() uses lxml's lenient HTML parser, which still
# builds a complete tree (and wraps the fragment in <html>/<body>).
text = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>
    </ul>
</div>
'''

# '//*' selects every element node in the document, at any depth.
selector = etree.HTML(text)
result = selector.xpath('//*')
print(result)
输出
[<Element html at 0x1761bfd5508>, <Element body at 0x1761bfd5a88>, <Element div at 0x1761bfd5ac8>, <Element ul at 0x1761bfd5b08>, <Element li at 0x1761bfd5e88>, <Element a at 0x1761bfd5f08>, <Element li at 0x1761bfd5f48>, <Element a at 0x1761bfd5f88>, <Element li at 0x1761bfd5fc8>, <Element a at 0x1761bfd5ec8>, <Element li at 0x1761bfdb048>, <Element a at 0x1761bfdb088>, <Element li at 0x1761bfdb0c8>, <Element a at 0x1761bfdb108>]
from lxml import etree

# '//li/a' selects <a> elements that are direct children of any <li>.
# `text` is the sample HTML string defined in the first example.
selector = etree.HTML(text)
result = selector.xpath('//li/a')
print(result)
输出
[<Element a at 0x1761c02dec8>, <Element a at 0x1761c02de88>, <Element a at 0x1761c02df08>, <Element a at 0x1761c02df48>, <Element a at 0x1761c02df88>]
from lxml import etree

# '..' steps up to the parent node, so '//li/..' yields the parent of the
# <li> elements.  `text` is the sample HTML string from the first example.
selector = etree.HTML(text)
result = selector.xpath('//li/..')
print(result)
输出
[<Element ul at 0x1761ae7c288>]
from lxml import etree

# '[@class="item-0"]' keeps only the <li> elements whose class attribute is
# exactly "item-0".  `text` is the sample HTML string from the first example.
selector = etree.HTML(text)
result = selector.xpath('//li[@class="item-0"]')
print(result)
输出
[<Element li at 0x1761afe2dc8>, <Element li at 0x1761c067748>]
from lxml import etree

# `text` is the sample HTML string from the first example.
selector = etree.HTML(text)

# '/' only descends to direct children, so text() here sees the text nodes
# directly under <li>, not the link text nested inside <a>.
result1 = selector.xpath('//li[@class="item-0"]/text()')

# Stepping into <a> first reaches the actual link text.
result2 = selector.xpath('//li[@class="item-0"]/a/text()')

print(result1)
print(result2)
输出
['\n ']
['first item', 'fifth item']
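注：`//li[@class="item-0"]/text()` 得到 `['\n ']`，因为 "/" 只选取直接子节点：<li> 的文字位于其子节点 <a> 内部，所以取不到 "first item"；返回的只是最后一个未闭合 <li> 中 </a> 之后的换行和空白。要获取链接文字，应使用 `//li[@class="item-0"]/a/text()`。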
from lxml import etree

# '@href' returns the attribute values themselves (as strings) rather than
# elements.  `text` is the sample HTML string from the first example.
selector = etree.HTML(text)
result = selector.xpath('//li[@class="item-0"]/a/@href')
print(result)
输出
['link1.html', 'link5.html']
from lxml import etree

# The class attribute holds two space-separated values.
text1 = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''

selector = etree.HTML(text1)

# An exact comparison fails: the attribute value is "li li-first", not "li".
result1 = selector.xpath('//li[@class="li"]/a/text()')

# contains() does a substring test on the attribute value, so it matches.
result2 = selector.xpath('//li[contains(@class,"li")]/a/text()')

print(result1)
print(result2)
输出
[]
['first item']
from lxml import etree

text2 = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''

selector = etree.HTML(text2)

# 'and' combines predicates: the class must contain "li" AND the name
# attribute must equal "item".
result = selector.xpath(
    '//li[contains(@class,"li") and @name="item"]/a/text()'
)
print(result)
输出
['first item']
from lxml import etree

text = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>
    </ul>
</div>
'''

selector = etree.HTML(text)

# XPath positions are 1-based: [1] is the first <li>.
result1 = selector.xpath('//li[1]/a/text()')
print(result1)

# last() is the position of the final <li>.
result2 = selector.xpath('//li[last()]/a/text()')
print(result2)

# position()<3 keeps the first two <li> elements.
result3 = selector.xpath('//li[position()<3]/a/text()')
print(result3)

# last()-2 is the third <li> from the end.
result4 = selector.xpath('//li[last()-2]/a/text()')
print(result4)
输出
['first item']
['fifth item']
['first item', 'second item']
['third item']
from lxml import etree

# Same list as before, except the first link wraps its text in a <span>.
text3 = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>
    </ul>
</div>
'''

selector = etree.HTML(text3)

# ancestor::* -> every ancestor element of the first <li>.
result1 = selector.xpath('//li[1]/ancestor::*')
print(result1)

# ancestor::div -> only the ancestors that are <div> elements.
result2 = selector.xpath('//li[1]/ancestor::div')
print(result2)

# attribute::* -> all attribute values of the first <li>.
result3 = selector.xpath('//li[1]/attribute::*')
print(result3)

# child::a[...] -> <a> child elements whose href is "link1.html".
result4 = selector.xpath('//child::a[@href="link1.html"]')
print(result4)

# descendant::span -> <span> elements at any depth below the first <li>.
result5 = selector.xpath('//li[1]/descendant::span')
print(result5)

# following::*[2] -> the second element after the first <li> in document
# order.
result6 = selector.xpath('//li[1]/following::*[2]')
print(result6)

# following-sibling::* -> all later siblings of the first <li>.
result7 = selector.xpath('//li[1]/following-sibling::*')
print(result7)
输出
[<Element html at 0x1761c02db88>, <Element body at 0x1761c07bf08>, <Element div at 0x1761c078308>, <Element ul at 0x1761c086088>]
[<Element div at 0x1761c078308>]
['item-0']
[<Element a at 0x1761c086288>]
[<Element span at 0x1761c06e6c8>]
[<Element a at 0x1761c06e688>]
[<Element li at 0x1761c078b08>, <Element li at 0x1761c078648>, <Element li at 0x1761c0864c8>, <Element li at 0x1761c086448>]
标签:节点 result htm cond des als contains sel path
原文地址:https://www.cnblogs.com/locke-hu/p/9236409.html