标签:return content sts class lag bsp star name bytes
import multiprocessing
import os
import re
import sys
import urllib.parse
import urllib.request as lib

# Directory (relative to the working directory) where matching pages are saved.
_OUTPUT_DIR = 'craw'
# Only double-quoted href attributes are recognised.
_HREF_RE = re.compile(r'href="(.*?)"')


def craw_links(url, depth, keyword, processed):
    """Fetch *url*, save it if it matches *keyword*, then follow its links.

    Args:
        url: Address of the page to crawl. A URL that does not start with
            ``http://`` or ``https://`` is ignored.
        depth: Number of further link levels to follow. With ``0`` the page
            itself is fetched but none of its links are followed.
        keyword: Iterable of strings that are joined with ``|`` and used as
            one regular expression. When the joined pattern is empty, every
            fetched page is saved.
        processed: List of URLs that have already been visited. It is
            mutated in place so that recursive calls share the same record.

    Returns:
        None. Matching pages are written to the ``craw`` directory.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a page
            cannot be fetched (this aborts the crawl, as it always did).
    """
    if not url.startswith(('http://', 'https://')):
        return
    if url in processed:
        # Avoid processing the same URL again.
        return
    processed.append(url)

    print('Crawing ' + url + '...')
    # The context manager closes the response even if read() fails.
    with lib.urlopen(url) as response:
        contents = response.read()
    # The decoded text is only used for searching, so undecodable bytes are
    # replaced instead of aborting the crawl; the raw bytes are what is saved.
    contents_decoded = contents.decode('UTF-8', errors='replace')

    pattern = '|'.join(keyword)
    # With no keywords to filter on, every page is saved.
    if not pattern or re.search(pattern, contents_decoded):
        os.makedirs(_OUTPUT_DIR, exist_ok=True)
        file_name = url.replace(':', '_').replace('/', '_')
        with open(os.path.join(_OUTPUT_DIR, file_name), 'wb') as out:
            out.write(contents)

    if depth <= 0:
        return
    for link in _HREF_RE.findall(contents_decoded):
        try:
            # Resolves relative links against the current page and leaves
            # absolute links untouched.
            link = urllib.parse.urljoin(url, link)
        except ValueError:
            # Malformed href; skip it.
            continue
        if link.endswith(('.htm', '.html')):
            craw_links(link, depth - 1, keyword, processed)


if __name__ == '__main__':
    processed = []
    keywords = ('KeyWord1', 'KeyWord2')
    # exist_ok makes repeated runs safe when the directory is already there.
    os.makedirs(_OUTPUT_DIR, exist_ok=True)
    craw_links(r'http://docs.python.org/3/library/index.html', 1, keywords, processed)
标签:return content sts class lag bsp star name bytes
原文地址:http://www.cnblogs.com/cmnz/p/7096607.html