标签:
主要就是用了两个库:urllib2 和 BeautifulSoup。
作用是从HTML中解析出解梦的查询词和具体的解释。
# -*- coding: utf-8 -*-
"""Scrape dream-interpretation pages from tools.2345.com (Python 2 script).

For each page the title and the explanation text are extracted from the
HTML and appended to a UTF-8 text file, one value per line.
"""
import random
import time
import urllib
import urllib2

from BeautifulSoup import BeautifulSoup

# Browser-like User-Agent sent with every request.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
              'AppleWebKit/537.36 (KHTML, like Gecko)')


def fetchURL(str_url):
    """Download one page and extract its title and explanation text.

    Args:
        str_url: URL of the page to fetch.

    Returns:
        The ``[title, content]`` list produced by ``parse_content_page``,
        or ``None`` when the request, the GB2312 decoding or the parsing
        fails.
    """
    headers = {'User-Agent': USER_AGENT}

    try:
        # The headers must be handed to Request, otherwise urllib2 sends its
        # own default User-Agent and the dict above has no effect.
        request = urllib2.Request(str_url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            html = response.read().decode('gb2312')
        finally:
            response.close()
        return parse_content_page(html)
    except Exception:
        # Best effort: a page that cannot be fetched or decoded is skipped.
        # ``Exception`` (rather than a bare ``except``) keeps Ctrl-C working.
        return None


def parse_content_page(html):
    """Extract the title and the explanation text from a page.

    Args:
        html: Decoded HTML source of the page.

    Returns:
        ``[title, content]`` taken from ``<h1 class="art_title">`` and
        ``<div class="dream_detail">``, or ``None`` when either element
        (or the ``<body>``) is missing.
    """
    parsed_html = BeautifulSoup(html)
    try:
        title = parsed_html.body.find('h1', attrs={'class': 'art_title'}).text
        content = parsed_html.body.find(
            'div', attrs={'class': 'dream_detail'}).text
    except AttributeError:
        # ``find`` returned None (or there is no body), so ``.text`` failed.
        return None

    return [title, content]


def main():
    """Fetch pages 1 to 9 and write each title and explanation to a file."""
    foutput = 'jiemeng.txt'
    with open(foutput, 'w') as fout:
        for i in xrange(1, 10):
            request_url = 'http://tools.2345.com/zhgjm/%s.htm' % str(i)
            x = fetchURL(request_url)
            if x is not None:
                # Drops three bytes at each end of the UTF-8 title --
                # presumably one 3-byte CJK decoration character per side;
                # TODO confirm against the page markup.
                print >>fout, x[0].encode('utf8')[3:-3]
                print >>fout, x[1].encode('utf8')

            # sleep for a while between two http requests
            seconds = random.random() * 10 + 2
            time.sleep(seconds)


if __name__ == '__main__':
    main()
标签:
原文地址:http://www.cnblogs.com/naive/p/4306990.html