标签:
#!/usr/bin/env python3
# coding: utf-8
"""Scrape one stylebop.com product page and print its details.

Prints, in order: the large product image URLs, the brand, the price, the
available sizes and the product name.  Written for Python 3 (``print`` is
used as a function).
"""
import re

from pyquery import PyQuery

# Page scraped when main() is called without an explicit URL.
PRODUCT_URL = 'http://www.stylebop.com/cn/product_details.php?id=606526&special=sale'

# Prepended to image paths that do not already start with "http".
SITE_ROOT = 'http://www.stylebop.com'

# Captures a single-quoted value ending in "jpg" (group 1) and, three quoted
# values later, another single-quoted value ending in "jpg" (group 2).
# Group 2 is the one used as the large image.
_IMAGE_HREF_RE = re.compile(r".*?'(.*?jpg)'.*?'.*?'.*?'.*?'.*?'(.*?jpg)'.*?")


def _large_image_url(href):
    """Return the absolute large-image URL encoded in *href*, or None.

    None is returned when *href* is empty/None or does not match
    ``_IMAGE_HREF_RE``, so callers can skip links of an unexpected shape
    instead of failing on them.
    """
    if not href:
        return None
    match = _IMAGE_HREF_RE.search(href)
    if match is None:
        return None
    image = match.group(2)
    if not image.startswith('http'):
        image = SITE_ROOT + image
    return image


def _image_urls(page):
    """Return the large-image URLs found in the ``li.image_click_rotator`` items."""
    urls = []
    for item in page('li').filter('.image_click_rotator'):
        # The link carrying the image paths is the first child of the
        # item's first child.
        href = item[0][0].get('href')
        url = _large_image_url(href)
        if url is not None:
            urls.append(url)
    return urls


def _price_text(page):
    """Return the price string built from the ``#product_price`` div.

    If the first span in the div has class ``old_price``, the result is the
    text of ``span:eq(1)`` and ``span:eq(3)`` (zero-based indices) joined by
    " / ".  Otherwise it is the whole div's text joined with the first
    span's text.

    NOTE(review): presumably ``old_price`` marks a discounted item and the
    two joined parts are the price in two currencies -- confirm against the
    page markup.
    """
    price_div = page('div').filter('#product_price')
    if price_div('span:first').hasClass('old_price'):
        return price_div('span:eq(1)').text() + ' / ' + price_div('span:eq(3)').text()
    return price_div.text() + ' / ' + price_div('span:first').text()


def _sizes(page):
    """Return the text of every ``<option>`` inside ``select.newInput2``."""
    # Iterating a PyQuery selection yields the underlying HTML elements, so
    # ``.text`` here is a plain attribute (and may be None for an empty option).
    return [option.text for option in page('select').filter('.newInput2')('option')]


def main(url=PRODUCT_URL):
    """Fetch the product page at *url* and print its details.

    Args:
        url: Address of the product page; defaults to ``PRODUCT_URL``.
    """
    page = PyQuery(url=url)
    product_info = page('div').filter('.productInfo')

    # The printed labels are Chinese for: product images, brand, price,
    # size, product name.
    print('产品图片:', _image_urls(page))
    print('品牌:%s' % product_info('a:first').text())
    print('价格:', _price_text(page))
    print('size:', _sizes(page))
    print('产品名:%s' % product_info('span:first').text())


if __name__ == '__main__':
    main()
标签:
原文地址:http://www.cnblogs.com/xccnblogs/p/4894405.html