标签:style blog http color strong os
Beautiful Soup生成商品详情页面的剖析树,
主要函数:findAll(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
利用findAll先获取标签范围的内容,再利用正则表达式进行匹配输出。
Beautiful Soup的中文文档:
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html#Searching%20the%20Parse%20Tree
程序:
1 #!/usr/bin/python 2 import urllib2 3 import sys 4 import chardet 5 import re 6 from BeautifulSoup import BeautifulSoup 7 def html(): 8 # rfile = open(urllist,‘rb‘) 9 # buf = rfile.read().split(‘\n‘) 10 # rfile.close() 11 # for i in range(len(buf)): 12 # website = buf[i] 13 # print website 14 website = raw_input("input link:") 15 page = urllib2.urlopen(website).read() 16 mychar=chardet.detect(page) 17 # print mychar 18 html = BeautifulSoup(page) 19 # print html.originalEncoding 20 # html = BeautifulSoup(pageg, fromEncoding="gbk") 21 m = re.match(‘http:\/\/(.*).(com|cn)‘,website).group(1) 22 patt = ‘[1-9][0-9]*(?:\.[0-9]+)?|0\.[0-9]+]‘ 23 if m == ‘item.taobao‘: 24 price = html.find(attrs={"class":"tb-public-price"}) 25 match1 = re.search(patt,str(price)) 26 img = html.find(attrs={"id":"J_ImgBooth"}) 27 match2 = re.search(‘src="(http.*jpg)"‘,str(img)) 28 print "title:",html.title.text 29 print "price:",match1.group() 30 print "img:",match2.group(1) 31 elif m == ‘detail.tmall‘ or m == ‘chaoshi.detail.tmall‘: 32 price = html.find(attrs={"class":"detail-price tm-clear"}) 33 match1 = re.search(patt,str(price)) 34 img = html.find(attrs={"id":"J_ImgBooth"}) 35 match2 = re.search(‘src="(http.*jpg)"‘,str(img)) 36 print "title:",html.title.text 37 print "price:",match1.group() 38 print "img:",match2.group(1) 39 elif m == ‘detail.ju.taobao‘: 40 price = html.find(attrs={"class":"currentPrice floatleft"}) 41 img = html.find(attrs={"class":"normal-pic "}) 42 if img == None : 43 img = html.find(attrs={"class":"item-pic-wrap"}) 44 match1 = re.search(patt,str(price)) 45 match2 = re.search(‘src="(http[^\"]*?)"‘,str(img)) 46 print "title:",html.title.text 47 print "price:",match1.group() 48 print "img:",match2.group(1) 49 else: 50 print website 51 if __name__ == ‘__main__‘: 52 html()
运行结果:
----@ubuntu:~/python$ python html.py input link:http://item.taobao.com/item.htm?spm=1.7274553.1997522421.1.FKA5Ar&id=38443208410&scm=2004.1.515.0 title: 2014夏装新款欧美风ZARA MICN女装衬衫白底定位印花长袖雪纺衫女-淘宝网 price: 43.00 img: http://img03.taobaocdn.com/bao/uploaded/i3/T1MnJaFJXeXXXXXXXX_!!0-item_pic.jpg_400x400.jpg
-----@ubuntu:~/python$ python html.py input link:http://detail.ju.taobao.com/home.htm?spm=608.2291429.1.d1.tmDQQs&item_id=39165873670&id=10000002887630 title: 【聚_世界杯】【三只松鼠】爆款坚果组合750g-聚划算团购 price: 42.90 img: http://gju3.alicdn.com/bao/uploaded/i1/T1aV7LFGRcXXb1upjX.jpg_400x400Q90.jpg
Beautiful Soup获取淘宝商品详情,布布扣,bubuko.com
标签:style blog http color strong os
原文地址:http://www.cnblogs.com/tonto/p/3820742.html