标签:
刚接触爬虫,就用 XPath 爬取了我最近很关注的手机安卓 6.0 ROM 更新消息,以及某论坛当天的新帖。这应该算是我第一个真正意义上的爬虫代码了,很简短。
# -*- coding: utf-8 -*-
"""Print LG G2 (D802) file listings and the newest threads of a forum board.

Two pages are fetched with ``requests`` and queried with lxml XPath:

* an androidfilehost.com search result page (file name, date, size), and
* a bbs.gfan.com board listing (time, category, title of each thread).

Everything is written to standard output.  ``print`` is always called with a
single pre-built string so the script emits the same text on Python 2 and 3.
"""
from lxml import etree
import requests

# Seconds to wait for a server before giving up; without a timeout
# ``requests.get`` may block indefinitely.
_REQUEST_TIMEOUT = 10

# Value of the ``font/@color`` attribute at which the thread listing stops.
_STOP_COLOR = '#0000FF'


def _fetch_tree(url, timeout):
    """Download *url* and return its body parsed as an lxml HTML tree."""
    response = requests.get(url, timeout=timeout)
    return etree.HTML(response.text)


def _first(nodes):
    """Return the first XPath result in *nodes*, or None if it is empty."""
    return nodes[0] if nodes else None


def spider_ROM(url, timeout=_REQUEST_TIMEOUT):
    """Print name, date and size of every file listed on the page at *url*.

    Args:
        url: Address of the file-listing page to scrape.
        timeout: Seconds to wait for the HTTP response.

    The three XPath queries return independent lists; they are walked in
    lockstep with ``zip`` so that a missing date or size shortens the output
    instead of raising ``IndexError`` half-way through.
    """
    selector = _fetch_tree(url, timeout)
    item = '//*[@class="list-group files"]/li'
    names = selector.xpath(item + '/h4/a/text()')
    sizes = selector.xpath(item + '/p/span[@class="info"]/text()')
    dates = selector.xpath(item + '/p/span[@class="date"]/text()')

    for index, (name, date, size) in enumerate(zip(names, dates, sizes)):
        print(name)
        # Two spaces between date and size reproduce the original
        # ``print date + ' ',`` followed by ``print size`` output.
        line = date + '  ' + size
        # Every entry except the fourth (index 3) is followed by a blank line,
        # exactly as in the original script.
        # NOTE(review): why index 3 is singled out is not evident from the
        # code -- confirm whether "the last entry" was intended.
        if index != 3:
            line += '\n'
        print(line)
    print(url + '\n')


def spider_Jifeng(url, timeout=_REQUEST_TIMEOUT):
    """Print time, category and title of the leading threads at *url*.

    Args:
        url: Address of the forum board listing to scrape.
        timeout: Seconds to wait for the HTTP response.

    Rows are processed in page order.  The listing stops at the first row
    whose ``td[2]/em/span/font`` colour equals ``#0000FF``; the original
    author's comment describes this as "only show today's posts".
    Rows lacking a category, title or time are skipped rather than raising
    ``IndexError``.
    """
    selector = _fetch_tree(url, timeout)
    rows = selector.xpath('//*[starts-with(@id,"normalthread")]/tr')

    for row in rows:
        # Only show today's posts: stop at the first row carrying the marker.
        # NOTE(review): a row without any font colour is treated as "not the
        # stop marker" and is printed -- confirm that this is the wanted
        # behaviour for such rows.
        if _first(row.xpath('td[2]/em/span/font/@color')) == _STOP_COLOR:
            break

        # The category text sits either directly in the link or inside a
        # nested <font> element.
        category = _first(row.xpath('th/em/a/text()'))
        if category is None:
            category = _first(row.xpath('th/em/a/font/text()'))

        title = _first(row.xpath('th/a/text()'))

        # Same two-layout fallback for the post time.
        posted = _first(row.xpath('td[2]/em/span/font/span/text()'))
        if posted is None:
            posted = _first(row.xpath('td[2]/em/span/span/text()'))

        if category is None or title is None or posted is None:
            continue

        # Joining with single spaces mirrors the comma-separated Python 2
        # ``print`` statement of the original.
        print(u' '.join([
            posted.replace(u'\xa0', u'') + ' ',
            u'【' + category + u'】',
            title.replace(' ', ''),
        ]))
    print(url)


def main():
    """Run both scrapers against their fixed URLs, with banner lines."""
    url_rom = 'https://www.androidfilehost.com/?w=search&s=d802'
    url_jifeng = 'http://bbs.gfan.com/forum.php?mod=forumdisplay&fid=1345&filter=author&orderby=dateline'
    print('')
    print('')
    print(u' |【LG G2 安卓6.0rom 消息更新】|')
    spider_ROM(url_rom)
    print('')
    print(u' |【机锋论坛 LG G2 今日 新帖汇总更新】|')
    spider_Jifeng(url_jifeng)
    print('')
    print('')


if __name__ == '__main__':
    main()
标签:
原文地址:http://www.cnblogs.com/chenxy93/p/4973895.html