标签:python python2.7 lxml xpath
Python2.7 lxml xpath
1:下载setuptools: https://pypi.python.org/pypi/setuptools,在这里下载Windows(simplified)下边的ez_setup.py
2:安装:在cmd下执行 python ez_setup.py;如果你安装了多个版本的python,则进入相应版本的文件夹,用对应的python.exe执行ez_setup.py
3:下载想要的lxml安装包
https://pypi.python.org/simple/lxml/
64位、32位:
lxml-2.3-py2.7-win-amd64.egg
lxml-2.3-py2.7-win32.egg
4:安装
进入C:\Python27\Scripts
使用命令行:
easy_install D:\Downloads\lxml-2.3-py2.7-win-amd64.egg
例子:
#coding:utf-8 import urllib import urllib2 from lxml import etree as etree if __name__ == "__main__": req_url='www.baidu.com' headers= {'User-Agent':'"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0)Gecko/20100101 Firefox/26.0"'} req= urllib2.Request(req_url, headers = headers) content= urllib2.urlopen(req, timeout=60).read() ifisinstance(content, unicode): pass else: content= content.decode('GBK') htmlSource= etree.HTML(content) names= htmlSource.xpath( u'//*[@id="buildhistory"]/div[2]/table/*[@class="finish_mousenone"]/td[2]') fori in names: printi.text
# -*- coding: cp936 -*- import urllib import urllib2 from lxml import etree as etree def InitSogouBranchInfo(req_url=None): ifreq_url == None or req_url == '': print"req_url == none,return" return headers= {'User-Agent':'"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0)Gecko/20100101 Firefox/26.0"'} req= urllib2.Request(req_url, headers = headers) content= urllib2.urlopen(req, timeout=60).read() #printcontent ifisinstance(content, unicode): pass else: content= content.decode('GBK') htmlSource= etree.HTML(content) buildResultRaw= htmlSource.xpath(ur'//*[@id="buildhistory"]/div[2]/table/*[@class="finish_mousenone"]') #buildTime=htmlSource.xpath(ur'//*[@id="buildhistory"]/div[2]/table/*[@class="finish_mousenone"]/td[2]') #buildVersion= htmlSource.xpath(ur'//*[@id="buildhistory"]/div[2]/table/*[@class="finish_mousenone"]/td[3]') #buildBranch=htmlSource.xpath(ur'//*[@id="buildhistory"]/div[2]/table/*[@class="finish_mousenone"]/td[6]') #buildAuthor=htmlSource.xpath(ur'//*[@id="buildhistory"]/div[2]/table/*[@class="finish_mousenone"]/td[7]') #buildDownloadUrl=htmlSource.xpath(ur'//*[@id="buildhistory"]/div[2]/table/*[@class="finish_mousenone"]/td[12]/a') result= { 'buildTime' :[], 'buildVersion' :[], 'buildBranch' :[], 'buildAuthor' :[], 'buildDownloadUrl' :[] } sum= 0 fortree in buildResultRaw: str= '' buildTime= tree.xpath(ur'td[2]') iflen(buildTime)>0 and not buildTime[0].text==None: str= str + buildTime[0].text+' ' result['buildTime'].append(buildTime[0].text) else: result['buildTime'].append("") buildVersion= tree.xpath(ur'td[3]') iflen(buildVersion)>0 and not buildVersion[0].text==None: str= str + buildVersion[0].text+' ' result['buildVersion'].append(buildVersion[0].text) else: result['buildVersion'].append("") buildBranch= tree.xpath(ur'td[6]') iflen(buildBranch)>0 and not buildBranch[0].text==None: str= str + buildBranch[0].text+' ' result['buildBranch'].append(buildBranch[0].text) else: result['buildBranch'].append("") buildAuthor= tree.xpath(ur'td[7]') 
iflen(buildAuthor)>0 and not buildAuthor[0].text==None: str= str + buildAuthor[0].text+' ' result['buildAuthor'].append(buildAuthor[0].text) else: result['buildAuthor'].append("") buildDownloadUrl= tree.xpath(ur'td[12]/a/@href') ifnot buildDownloadUrl==[]: str= str + buildDownloadUrl[0]+' ' result['buildDownloadUrl'].append(buildDownloadUrl[0]) else: result['buildDownloadUrl'].append("") #printstr #sum= sum + 1 #printsum returnresult if __name__ == "__main__": result=InitSogouBranchInfo('http://build.sogou-inc.com/system_build/common_module/project.php?project=ime&class=ime&branch=branch\PinyinDev_R_7_4_Update_Kernel50') f= open(r'd:/t.txt','w') length= len(result['buildVersion']) fori in range(0,length): printresult['buildDownloadUrl'][i]
参考:
http://www.cnblogs.com/zhuyp1015/archive/2012/07/17/2596495.html
http://blog.sina.com.cn/s/blog_641289eb0100yf84.html
http://blog.csdn.net/zhaokuo719/article/details/8209496
http://blog.csdn.net/shirdrn/article/details/7030026
http://bbs.csdn.net/topics/390823000
http://www.cnblogs.com/bluescorpio/archive/2010/05/31/1748503.html
标签:python python2.7 lxml xpath
原文地址:http://blog.csdn.net/lileiyang12/article/details/41962385