标签:style blog http color os io for art
# -*- coding: utf-8 -*-
"""Scrape proxy IPs and ports from a proxy listing site and check them.

Listing pages are fetched from http://www.xici.net.co/nn/<page>. Each proxy
found is then used to request a test URL; the proxies that return the
expected page are written to a file, fastest first.
"""

import re
import threading
import time

try:
    import urllib2
except ImportError:
    # Python 3 exposes the same functions and classes under urllib.request.
    import urllib.request as urllib2

# Filled in by get_proxy_from_cnproxy(); each entry is
# [ip, port, protocol, addr].
proxylist = []

# One table row of the listing page. Capture groups, in order: the flag
# image's alt text, the IP, the port and the protocol. The cells holding the
# location link and the following column are matched but not captured.
PROXY_ROW_RE = re.compile(
    r'<td><img alt="(.+?)" src=".+?" /></td>[\s\S]*?<td>(.+?)</td>'
    r'[\s\S]*?<td>(.+?)</td>[\s\S]*?<td>[\s\S]*?<a href=".+?">.+?</a>'
    r'[\s\S]*?</td>[\s\S]*?<td>.+?</td>[\s\S]*?<td>(.+?)</td>'
)


def _to_text(data):
    """Return *data* as a native string, decoding raw bytes as UTF-8."""
    if isinstance(data, str):
        return data
    # Python 3 responses are bytes; undecodable bytes are replaced so that a
    # badly encoded page cannot abort the whole run.
    return data.decode("utf-8", "replace")


def parse_proxy_page(html):
    """Extract proxy records from the HTML of one listing page.

    Args:
        html: Page source, as text or as raw bytes.

    Returns:
        A list of ``[ip, port, protocol, addr]`` lists, in page order.
    """
    return [
        [ip, port, protocol, addr]
        for addr, ip, port, protocol in PROXY_ROW_RE.findall(_to_text(html))
    ]


def get_proxy_from_cnproxy(pages=(1,)):
    """Download listing pages and append their proxies to ``proxylist``.

    Args:
        pages: Page numbers to fetch. Defaults to the first page only.

    Returns:
        The module-level ``proxylist`` after it has been extended.
    """
    for page in pages:
        target = r"http://www.xici.net.co/nn/%d" % page
        print(target)
        response = urllib2.urlopen(target)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        proxylist.extend(parse_proxy_page(html))
    print(proxylist)
    return proxylist


class ProxyCheck(threading.Thread):
    """Thread that tests proxies and saves the working ones to a file."""

    def __init__(self, proxylist, fname):
        """Store the proxies to test and the output file name.

        Args:
            proxylist: List of ``[ip, port, protocol, addr]`` entries.
            fname: Path of the file that ``save`` writes.
        """
        threading.Thread.__init__(self)
        self.proxylist = proxylist
        self.timeout = 5  # seconds allowed for each test request
        self.test_url = "http://www.baidu.com/"
        # Marker that must appear in the fetched page for a proxy to count
        # as working (presumably a string from Baidu's footer -- confirm).
        self.test_str = "030173"
        # Working proxies as [ip, port, protocol, addr, seconds_used].
        self.checkedPProxyList = []
        self.fname = fname

    def checkProxy(self):
        """Request ``test_url`` through every proxy and record the good ones.

        A proxy is recorded when the response body contains ``test_str``.
        Proxies that fail for any reason are reported and skipped.
        """
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxylist:
            proxy_handler = urllib2.ProxyHandler(
                {"http": r'http://%s:%s' % (proxy[0], proxy[1])}
            )
            opener = urllib2.build_opener(cookies, proxy_handler)
            opener.addheaders = [('user-agent', 'mozilla/5.0(iphone; u; cpu like mac os x; en) applewebkit/420+ (khtml, like gecko) version/3.0 mobile/1A537a safari/419.3')]
            started = time.time()
            try:
                # Use this opener directly instead of install_opener(), which
                # would replace the process-wide opener from a worker thread.
                response = opener.open(self.test_url, timeout=self.timeout)
                try:
                    body = _to_text(response.read())
                finally:
                    response.close()
            except Exception as exc:
                # Best effort: a dead or misbehaving proxy can fail in many
                # ways, and none of them should stop the remaining checks.
                print(exc)
                continue
            timeused = time.time() - started
            # Accept the marker at any offset, including the very start.
            if self.test_str in body:
                self.checkedPProxyList.append(
                    [proxy[0], proxy[1], proxy[2], proxy[3], timeused]
                )

    def sort(self):
        """Order the working proxies in place, fastest response first."""
        self.checkedPProxyList.sort(key=lambda entry: entry[4])

    def save(self):
        """Write the working proxies to ``fname``, one per line.

        Each line is ``ip:port<TAB>protocol<TAB>addr<TAB>seconds``.
        """
        with open(self.fname, "w") as out:
            for proxy in self.checkedPProxyList:
                out.write(
                    "%s:%s\t%s\t%s\t%s\n"
                    % (proxy[0], proxy[1], proxy[2], proxy[3], str(proxy[4]))
                )

    def run(self):
        """Thread entry point: check, sort and save the proxies."""
        self.checkProxy()
        self.sort()
        self.save()


if __name__ == "__main__":
    get_proxy_from_cnproxy()
    checker = ProxyCheck(proxylist, "test.txt")
    checker.start()
标签:style blog http color os io for art
原文地址:http://www.cnblogs.com/luzhiyuan/p/3889192.html