标签:
# coding: utf-8
"""Download travel articles linked from a gmw.cn index page.

The index page is parsed for ``<a href>`` links.  Every link whose prefixed
URL matches the 2015 or 2014 pattern is fetched once, its paragraph text is
extracted with regular expressions, and each non-empty article is written to
``gmw/travel/<n>.txt`` (``n`` counts saved articles, starting at 1).

This is a Python 2 script: ``sgmllib`` and ``urllib.urlopen`` are not
available in Python 3.
"""
__author__ = 'minmin'

import re
import sgmllib
import urllib

# Paragraph patterns, tried in order; the first one that yields any match is
# used.  Both capture groups exclude '<' and '>', so captured text never
# contains markup.
_PARAGRAPH_PATTERNS = (
    r"<p style=\"TEXT-INDENT: 30px; MARGIN: 0px 3px 15px\">([^<>]*)</p>",
    r"<p>([^<>]*)</p>",
)


def getHtml(url):
    """Return the body of *url* exactly as read from ``urllib.urlopen``."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() raises.
        page.close()


def func(str):
    """Fetch the page at *str* and return its article text.

    The page is downloaded once and matched against ``_PARAGRAPH_PATTERNS``.
    Every non-empty captured paragraph has its spaces removed and is appended
    to the result followed by a newline.

    :param str: URL of the article page.  The name shadows the builtin inside
        this function; it is kept so existing keyword callers keep working.
    :returns: the concatenated paragraphs, or ``''`` when nothing matched.
    """
    html = getHtml(str)

    paragraphs = []
    for regex in _PARAGRAPH_PATTERNS:
        paragraphs = re.findall(regex, html, re.M)
        if paragraphs:
            break

    pieces = []
    for text in paragraphs:
        if not text:
            continue
        # NOTE(review): this strips ordinary spaces; the published listing may
        # have lost an entity such as "&nbsp;" here -- confirm against the
        # author's original file.
        text = text.replace(" ", "")
        # No-ops with the current patterns (they cannot capture tags); kept so
        # a future tag-capturing pattern still gets <strong> markers replaced
        # by a space.
        text = text.replace("<strong>", " ")
        text = text.replace("</strong>", " ")
        pieces.append(text + '\n')
    return ''.join(pieces)


class URLPaser(sgmllib.SGMLParser):
    """SGML parser that collects the ``href`` of every ``<a>`` tag.

    After ``feed()``, ``self.urls`` holds the values in document order.
    """

    def reset(self):
        """Reset the parser state and start with an empty ``urls`` list."""
        sgmllib.SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        """Record the ``href`` attribute value(s) of an opening ``<a>`` tag."""
        self.urls.extend(v for k, v in attrs if k == 'href')


IParser = URLPaser()
# Fetch the index page through getHtml() so its connection is closed, then
# parse it for links.
IParser.feed(getHtml("http://travel.gmw.cn/node_39034.htm"))

# Only links that match one of these patterns are downloaded.
reg = 'http://travel.gmw.cn/2015-.*'
reg2 = 'http://travel.gmw.cn/2014-.*'
pattern = re.compile(reg)
patter = re.compile(reg2)

i = 0      # number of articles saved so far; also the output file name
url2 = []  # URLs already handled, so duplicate links are fetched only once
for url in IParser.urls:
    # Every href is prefixed with the site root, so only site-relative hrefs
    # can end up matching the patterns above.
    url = "http://travel.gmw.cn/" + url
    if not (pattern.match(url) or patter.match(url)):
        continue
    if url in url2:
        continue
    url2.append(url)

    # Single-argument print(...) produces the same output as the Python 2
    # print statement.
    print(url)
    # Fetch the article for 2015 and 2014 links alike.
    artical = func(url)
    print(artical)
    if artical:
        i = i + 1
        # The gmw/travel/ directory must already exist.
        with open("gmw/travel/" + str(i) + '.txt', 'a+') as f:
            f.write(artical)
标签:
原文地址:http://www.cnblogs.com/minmsy/p/4962710.html