A Python crawler that scrapes the Chinaz (站长之家) IP database. It is single-threaded and meant purely as practice: the IPv4 space holds about 4.3 billion addresses, so crawling it this way would take years at the very least. This is rough beginner code, so please bear with me.
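To see why "years" is no exaggeration, here is a rough back-of-envelope estimate. The requests-per-second figure is an assumption, not a measurement:

# Back-of-envelope: how long would a full IPv4 sweep take?
# Assumes roughly 10 requests/second sustained, which is already
# optimistic for a single-threaded crawler doing one HTTP request per IP.
total_ips = 2 ** 32                        # 4,294,967,296 IPv4 addresses
reqs_per_sec = 10                          # assumed throughput
seconds = total_ips / float(reqs_per_sec)
years = seconds / (3600 * 24 * 365)
print years                                # about 13.6 years

With that caveat out of the way, the script: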
#!/usr/bin/python
#coding=UTF-8
import urllib2
import re
import csv
import codecs

def gethtml(url):
    # Fetch the page, sending a browser-like User-Agent so the
    # request looks less like a bot.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

def getdate(html):
    # Pull the location fields out of the result page; the pattern
    # targets <span class="Whwtdhalf w50-0">...</span> blocks.
    r = r'<span class="Whwtdhalf w50\-0">(.*?)</span>\s*</p>'
    rs = re.compile(r, re.S)
    iplace = rs.findall(html)
    return iplace

def getdataid(html):
    # Pull the numeric form of the IP out of
    # <span class="Whwtdhalf w15-0">...</span> blocks.
    r = r'<span class="Whwtdhalf w15\-0">(\d+?)</span>'
    rs = re.compile(r)
    dataid = rs.findall(html)
    return dataid

def geturl(ip):
    # Chinaz lookup page for a dotted-quad IP address.
    url = 'http://ip.chinaz.com/%s' % ip
    return url

count = 1
with open('ipku.csv', 'a+') as csvfile:
    # Write a UTF-8 BOM first so Excel renders the Chinese headers correctly.
    csvfile.write(codecs.BOM_UTF8)
    spamwriter = csv.writer(csvfile, dialect='excel')
    # Headers: numeric address, IP address, server location.
    spamwriter.writerow(['数字地址', 'IP地址', '服务器地址'])
    # Walk the address space one IP at a time (starting from 211.0.10.0,
    # per the original ranges).
    for k in xrange(211, 256):
        for v in xrange(0, 256):
            for m in xrange(10, 256):
                for n in xrange(0, 256):
                    ip = '%d.%d.%d.%d' % (k, v, m, n)
                    url = geturl(ip)
                    html = gethtml(url)
                    dataid = getdataid(html)
                    iplace = getdate(html)
                    for h in dataid:
                        print count
                        count += 1
                        # iplace[1] is the second matched span, which
                        # holds the location on the lookup page.
                        spamwriter.writerow([h, ip, iplace[1]])

Printing count has no special meaning; it is only there so you can see that the process is still running.
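Since the single-threaded approach is the bottleneck, one natural next step is to parallelize the per-IP fetches. Below is a minimal sketch, not part of the original script: it assumes the gethtml/getdataid/getdate/geturl functions above are in scope, fetch_one is a hypothetical helper introduced here for illustration, and multiprocessing.dummy.Pool (the Python 2 standard-library thread pool) is used because the work is I/O-bound:

from multiprocessing.dummy import Pool  # thread pool; threads suit I/O-bound work

def fetch_one(ip):
    # Hypothetical helper: fetch and parse one IP's page, reusing the
    # functions defined in the script above.
    try:
        html = gethtml(geturl(ip))
        return ip, getdataid(html), getdate(html)
    except Exception:
        return ip, [], []  # skip unreachable pages instead of crashing

ips = ['211.0.10.%d' % n for n in xrange(0, 256)]  # one /24 block as a demo batch
pool = Pool(16)  # 16 worker threads
for ip, dataid, iplace in pool.map(fetch_one, ips):
    for h in dataid:
        print h, ip, iplace[1] if len(iplace) > 1 else ''
pool.close()
pool.join()

Even with 16 threads this only divides the earlier time estimate by a constant factor; covering the whole IPv4 space would still take months, so treat this purely as the practice exercise the author says it is.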
Original post: http://gosweet.blog.51cto.com/11759495/1905737