标签:
文章转载自:https://blog.linuxeye.com/410.html
代理服务器:http://www.proxy.com.ru
1 #coding: utf-8 2 3 import urllib2 4 import re 5 import time 6 import threading 7 import MySQLdb 8 9 rawProxyList = [] 10 checkedProxyList = [] 11 12 #抓取代理网站 13 targets = [] 14 for i in xrange(1, 23): 15 target = r"http://www.proxy.com.ru/list_%d.html" % i 16 targets.append(target) 17 #print target + "\n" 18 19 #抓取代理服务器正则 20 p = re.compile(r‘‘‘<tr><b><td>(\d+)</td><td>(.+?)</td><td>(\d+)</td><td>(.+?)</td><td>(.+?)</td></b></tr>‘‘‘) 21 22 #获取代理的类 23 24 class ProxyGet(threading.Thread): 25 def __init__(self, target): 26 threading.Thread.__init__(self) 27 self.target = target 28 29 30 def getProxy(self): 31 req = urllib2.Request(self.target) 32 respnse = urllib2.urlopen(req) 33 result = respnse.read() 34 matches = p.findall(result) 35 #print matches 36 for row in matches: 37 ip = row[1] 38 port = row[2] 39 addr = row[4].decode("cp936").encode("utf-8") 40 proxy = [ip, port, addr] 41 #print proxy 42 rawProxyList.append(proxy) 43 44 45 def run(self): 46 self.getProxy() 47 48 #核对代理是否有效的类 49 class ProxyCheck(threading.Thread): 50 def __init__(self,proxyList): 51 threading.Thread.__init__(self) 52 self.proxyList = proxyList 53 self.timeout = 5 54 self.testUrl = "http://www.baidu.com/" 55 self.testStr = "030173" 56 57 def checkProxy(self): 58 cookies = urllib2.HTTPCookieProcessor() 59 for proxy in self.proxyList: 60 proxyHandler = urllib2.ProxyHandler({"http": r‘http://%s:%s‘ %(proxy[0], proxy[1])}) 61 #print r‘http://%s:%s‘ %(proxy[0],proxy[1]) 62 opener = urllib2.build_opener(cookies, proxyHandler) 63 opener.addheaders = [(‘User-agent‘, ‘Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0‘)] 64 #urllib2.install_opener(opener) 65 t1 = time.time() 66 67 try: 68 #req = urllib2.urlopen("http://www.baidu.com", timeout=self.timeout) 69 req = opener.open(self.testUrl, timeout=self.timeout) 70 #print "urlopen is ok...." 71 result = req.read() 72 #print "read html...." 73 timeused = time.time() - t1 74 pos = result.find(self.testStr) 75 #print "pos is %s" %pos 76 77 if pos >= 1: 78 checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused)) 79 print "ok ip: %s %s %s %s" %(proxy[0],proxy[1],proxy[2],timeused) 80 else: 81 continue 82 except Exception, e: 83 #print e.message 84 continue 85 86 def run(self): 87 self.checkProxy() 88 89 90 if __name__ == "__main__": 91 getThreads = [] 92 checkThreads = [] 93 94 #对每个目标网站开启一个线程负责抓取代理 95 for i in range(len(targets)): 96 t = ProxyGet(targets[i]) 97 getThreads.append(t) 98 99 for i in range(len(getThreads)): 100 getThreads[i].start() 101 102 for i in range(len(getThreads)): 103 getThreads[i].join() 104 105 print ‘.‘*10 + "总共抓取了%s个代理" % len(rawProxyList) + ‘.‘*10 106 107 #开启20个线程负责校验,将抓取到的代理分成20份,每个线程校验一份 108 for i in range(20): 109 t = ProxyCheck(rawProxyList[((len(rawProxyList)+19)/20) * i:((len(rawProxyList)+19)/20) * (i+1)]) 110 checkThreads.append(t) 111 112 for i in range(len(checkThreads)): 113 checkThreads[i].start() 114 115 for i in range(len(checkThreads)): 116 checkThreads[i].join() 117 118 print ‘.‘*10 + "总共抓取了%s个代理" % len(checkedProxyList) + ‘.‘*10 119 120 #插入数据库,四个字段ip, port, speed, addr 121 def db_insert(insert_list): 122 try: 123 conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="meimei1118", db="ctdata", charset=‘utf8‘) 124 cursor = conn.cursor() 125 cursor.execute(‘delete from proxy‘) 126 cursor.execute(‘alter table proxy AUTO_INCREMENT=1‘) 127 cursor.executemany("INSERT INTO proxy(ip,port,speed,address) VALUES(%s, %s, %s,%s)", insert_list) 128 conn.commit() 129 cursor.close() 130 conn.close() 131 132 except MySQLdb.Error, e: 133 print "Mysql Error %d: %s" %(e.args[0], e.args[1]) 134 135 #代理排序持久化 136 proxy_ok = [] 137 for proxy in sorted(checkedProxyList, cmp=lambda x, y: cmp(x[3], y[3])): 138 if proxy[3] < 8: 139 #print "checked proxy is: %s:%s\t%s\t%s" %(proxy[0],proxy[1],proxy[2],proxy[3]) 140 proxy_ok.append((proxy[0], proxy[1], proxy[3], proxy[2])) 141 142 db_insert(proxy_ok)
标签:
原文地址:http://www.cnblogs.com/nju2014/p/4614698.html