This is a single-threaded Python crawler that scrapes the IP database pages on chinaz.com (站长之家). It is for practice only: the IPv4 space holds about 4.3 billion addresses, so crawling them one by one this way would take years at the very least. I'm a beginner and the code is rough; please bear with me.
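To sanity-check that "years" estimate, here is a quick back-of-envelope calculation. The 10 requests per second is my own assumed throughput for a single thread, not a figure measured from the site:

#!/usr/bin/python
# Back-of-envelope check of the "years" claim. The request rate is an
# assumption for illustration only.
TOTAL_IPS = 2 ** 32                # full IPv4 space: 4,294,967,296 addresses
REQS_PER_SEC = 10.0                # assumed single-threaded throughput
years = TOTAL_IPS / REQS_PER_SEC / (3600 * 24 * 365)
print '%.1f years' % years         # prints 13.6 -- years indeed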
#!/usr/bin/python
# coding=UTF-8
import urllib2
import re
import csv
import codecs

# Pretend to be an old IE browser so the site serves the normal page.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

def gethtml(url):
    # Fetch one lookup page and return the raw HTML.
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    return response.read()

def getdate(html):
    # Extract the location fields (the "Whwtdhalf w50-0" spans) from the page.
    pattern = re.compile(r'<span class="Whwtdhalf w50\-0">(.*?)</span>\s*</p>', re.S)
    return pattern.findall(html)

def getdataid(html):
    # Extract the numeric form of the IP (the "Whwtdhalf w15-0" spans).
    pattern = re.compile(r'<span class="Whwtdhalf w15\-0">(\d+?)</span>')
    return pattern.findall(html)

def geturl(ip):
    # Build the lookup URL for one IP.
    return 'http://ip.chinaz.com/%s' % ip

count = 1
with open('ipku.csv', 'a+') as csvfile:
    # Write a UTF-8 BOM so Excel opens the Chinese headers correctly.
    # (Note: in append mode this re-writes the BOM on every run.)
    csvfile.write(codecs.BOM_UTF8)
    spamwriter = csv.writer(csvfile, dialect='excel')
    # Columns: numeric address, IP address, server location.
    spamwriter.writerow(['数字地址', 'IP地址', '服务器地址'])
    # Walk part of the IPv4 space octet by octet, starting at 211.0.10.0.
    for k in xrange(211, 256):
        for v in xrange(0, 256):
            for m in xrange(10, 256):
                for n in xrange(0, 256):
                    ip = '%d.%d.%d.%d' % (k, v, m, n)
                    html = gethtml(geturl(ip))
                    dataid = getdataid(html)
                    iplace = getdate(html)
                    for h in dataid:
                        print count  # progress indicator only
                        count += 1
                        spamwriter.writerow([h, ip, iplace[1]])
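The single-threaded loop is the bottleneck mentioned at the top. As a rough illustration only (this is not part of the original script), the per-IP work could be wrapped in a function and fanned out over a thread pool. The pool size of 20, the sample /24 slice, and the error handling are my own assumptions; gethtml, getdataid, getdate, geturl, and spamwriter refer to the script above:

from multiprocessing.dummy import Pool  # thread-based Pool, Python 2 stdlib

def crawl_one(ip):
    # Fetch and parse one IP; return the CSV rows for it.
    try:
        html = gethtml(geturl(ip))
        iplace = getdate(html)
        return [(h, ip, iplace[1]) for h in getdataid(html)]
    except Exception:
        return []  # assumption: skip failures; a real crawler would retry/log

pool = Pool(20)                                   # 20 workers -- an arbitrary choice
ips = ['211.0.10.%d' % n for n in xrange(256)]    # one /24 slice as a demo
for rows in pool.imap_unordered(crawl_one, ips):
    for row in rows:
        spamwriter.writerow(row)                  # same writer as in the script above

Even with 20 workers the full address space is out of reach; this only shows where the concurrency would go.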
count is printed just so you can see that the process is still running; it has no other meaning.
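If printing every row ever floods the console, one common variant (my suggestion, not in the original) is to report only every N rows:

if count % 1000 == 0:              # N = 1000 is arbitrary
    print 'crawled %d rows so far' % count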
This post comes from the "全球互联云主机Q874247458" blog on 51CTO; please keep this attribution: http://gosweet.blog.51cto.com/11759495/1905737