标签:
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Page-view script: repeatedly requests every page listed in URLS.

Besides that, the script demonstrates three techniques:
1. Pick a random proxy IP and access the site through it, so that the
   client IP does not get banned.
2. Sleep a random few seconds after each page before requesting the next
   one, to avoid interception by layer 4-7 filtering devices in front of
   the site.
3. Change the HTTP User-Agent header, because some sites and layer 4-7
   devices check it.

NOTE: this is Python 2 code (it relies on urllib2 and urllib.urlopen).

Created on 2013-7-14

@author: QQ136354553
'''

import random
import re
import time
import urllib
import urllib2

import user_agents

# Listing page the proxy addresses are scraped from.
PROXYIPURL = 'http://www.goodips.com/?ip=&port=&dengji=&adr=%E7%94%B5%E4%BF%A1&checktime=&sleep=1%E7%A7%92%E5%86%85&cunhuo=48%E5%B0%8F%E6%97%B6%E4%BB%A5%E4%B8%8A&px='

# Captures an IPv4-looking address and, on the following line of the page,
# a 1-5 digit port enclosed in ">...<". Compiled once instead of per call.
_IP_PORT_RE = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).+\n.+>(\d{1,5})<')


class getProxyIP:
    '''Scrape proxy addresses from PROXYIPURL and format them for urllib2.'''

    def getProxyHtml(self):
        '''Download the proxy listing page and return its raw HTML.'''
        page = urllib.urlopen(PROXYIPURL)
        try:
            return page.read()
        finally:
            # Release the connection even if read() fails.
            page.close()

    def ipPortRe(self):
        '''Return the list of (ip, port) string tuples found in the page.'''
        return _IP_PORT_RE.findall(self.getProxyHtml())

    def proxyIP(self):
        '''Return the scraped proxies as urllib2.ProxyHandler mappings.

        The result looks like
        [{'http': 'http://221.238.28.158:8081'},
         {'http': 'http://183.62.62.188:9999'}]
        and is empty when the page contains no match.
        '''
        return [{'http': 'http://%s:%s' % (ip, port)}
                for ip, port in self.ipPortRe()]


def getHtml(url):
    '''Request *url* through a random proxy with a random User-Agent.

    The proxy list is scraped again on every call. The chosen proxy and
    User-Agent are printed before the request, and the proxy is printed
    once more after the page has been read successfully.

    Returns the proxy mapping that was used, e.g.
    {'http': 'http://221.238.28.158:8081'}.

    Raises IndexError when no proxy could be scraped, and
    urllib2.URLError (including its subclass urllib2.HTTPError) when the
    request fails.
    '''
    proxy_list = getProxyIP().proxyIP()
    proxy_ip = random.choice(proxy_list)  # random proxy from the list
    print(proxy_ip)

    proxy_support = urllib2.ProxyHandler(proxy_ip)
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    request = urllib2.Request(url)
    # Random entry from user_agents.user_agents for the User-Agent header.
    user_agent = random.choice(user_agents.user_agents)
    request.add_header('User-Agent', user_agent)
    print(user_agent)

    response = urllib2.urlopen(request)
    try:
        response.read()  # the body itself is not needed, only the visit
    finally:
        response.close()
    print(proxy_ip)
    return proxy_ip


# NOTE(review): the host names contained stray quote characters in the
# published source and look redacted - replace them with the real targets.
URLS = [
    'http://www.xxxxw.net/study.asp?vip=',
    'http://www.xxxxxx.com/?fromuid=16',
]


def main():
    '''Visit every URL in URLS forever, printing running totals.'''
    count_False, count = 0, 0
    while True:
        for url in URLS:
            count += 1
            try:
                getHtml(url)
            except Exception:
                # Covers urllib2.URLError and its subclass
                # urllib2.HTTPError as well as any other failure (e.g. an
                # empty proxy list). Unlike a bare "except:", Ctrl-C
                # (KeyboardInterrupt) still stops the script.
                count_False += 1
            # Wait a random 1-3 seconds before the next page.
            time.sleep(random.uniform(1, 3))
            print('%d Eroors,%d ok,总数 %d' % (count_False, count - count_False, count))


if __name__ == '__main__':
    main()
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
User-Agent strings to choose from when sending HTTP requests.

NOTE(review): this appears to be a separate module (user_agents.py); a
script that does "import user_agents" reads the list below as
user_agents.user_agents.

Created on 2013-7-14

@author: Administrator
'''

# Browser identification strings; callers pick one entry at random.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
]
标签:
原文地址:http://www.cnblogs.com/chenjingyi/p/5794712.html