标签:
qqzeng-ip.dat是一个特殊格式的dat文件,可以快速地查找IP对应的地理位置信息。据作者测试,100万个IP的查找耗时为0.5秒。
当然这和语言有非常大的关系,python的循环性能一直为人所诟病。目前python版本的测试结果是10万个IP的查找耗时3.X秒左右,还算够用,毕竟真实情况下,30秒~5分钟内的一批日志数据中,不重复的IP不太可能超过10万个。
作者提供了解析dat的java/c/php脚本,但没有提供python版本。所以我就写了一个,供需要用python语言读取IP地理位置信息的人使用。
代码如下:
# coding: utf-8
#
# Reader for a qqzeng-ip style ".dat" IPv4 location database.
#
# File layout, as parsed by the code below (all integers little-endian):
#   * 16-byte header: four unsigned 32-bit offsets
#       - offset of the first index record
#       - offset of the last index record
#       - offset of the first prefix record
#       - offset of the last prefix record
#   * index records, 12 bytes each:
#       start IP (u32), end IP (u32), text offset (3 bytes), text length (1 byte)
#   * prefix records, 9 bytes each:
#       first octet of the IP (1 byte), first index number (u32),
#       last index number (u32)
#
# The module runs unchanged on Python 2 and Python 3.
import os
import math
import socket
import struct
import io
from io import SEEK_SET

# Default database location: next to this source file.
path = os.path.normpath(os.path.dirname(os.path.abspath(__file__)) + "/qqzeng-ip-utf8.dat")


class IpSearch(object):
    """Look up the location text stored for an IPv4 address.

    The prefix table is loaded into memory when the object is created; index
    records and location texts are read from the open file on every lookup.

    The object owns an open file handle. Release it with ``close()`` or by
    using the instance as a context manager (``with IpSearch() as db: ...``).
    """

    # Class-level defaults keep the attributes defined (and ``__del__`` safe)
    # even when ``__init__`` fails before assigning them.
    fp = None
    firstStartIpOffset = None
    lastStartIpOffset = None
    preStartOffset = None
    preEndOffset = None
    ipCount = None
    prefixCount = None
    # Filled per instance in ``__init__``; a class-level dict would be shared
    # (and overwritten) by every instance.
    prefixList = None

    def __init__(self, dat_path=None):
        """Open the database and load its header and prefix table.

        Args:
            dat_path: Path of the database file. Defaults to the module-level
                ``path`` (a file next to this module).

        Raises:
            IOError/OSError: The file cannot be opened.
            ValueError: The file is too short to hold the header or the
                prefix table announced by the header.
        """
        self.prefixList = {}
        self.fp = io.open(path if dat_path is None else dat_path, "rb")

        header = self.fp.read(16)
        if len(header) < 16:
            self.close()
            raise ValueError("truncated IP database: incomplete header")
        (self.firstStartIpOffset,
         self.lastStartIpOffset,
         self.preStartOffset,
         self.preEndOffset) = struct.unpack("<4I", header)

        # Both offsets of a pair point at the *start* of a record, hence "+ 1".
        # Floor division keeps the counts integral on Python 3 as well.
        self.ipCount = (self.lastStartIpOffset - self.firstStartIpOffset) // 12 + 1
        self.prefixCount = (self.preEndOffset - self.preStartOffset) // 9 + 1

        self.fp.seek(self.preStartOffset, SEEK_SET)
        preBuff = self.fp.read(self.prefixCount * 9)
        if len(preBuff) < self.prefixCount * 9:
            self.close()
            raise ValueError("truncated IP database: incomplete prefix table")

        record = struct.Struct("<BII")  # 1 + 4 + 4 = 9 bytes, no padding
        for k in range(self.prefixCount):
            prefix, startIndex, endIndex = record.unpack_from(preBuff, k * 9)
            self.prefixList[prefix] = {
                "start_index": startIndex,
                "end_index": endIndex
            }

    def close(self):
        """Close the database file. Calling it more than once is harmless."""
        if self.fp is not None:
            self.fp.close()
            self.fp = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __del__(self):
        # Last-resort cleanup for callers that never call close().
        self.close()

    def get(self, ip):
        """Return the raw location record stored for ``ip``.

        Args:
            ip: Dotted IPv4 address as text, e.g. ``"210.51.200.123"``.

        Returns:
            The record bytes exactly as stored in the file, or ``b""`` when
            ``ip`` is empty/``None`` or no stored range contains it.
            (The demo at the bottom of this file decodes the result as UTF-8.)

        Raises:
            ValueError: The first dot-separated field is not an integer.
            socket.error: ``socket.inet_aton`` rejects the address.
        """
        if not ip:
            return b""

        prefix = int(ip.split(".")[0])
        ipnum = self.ip2unit(ip)

        index = self.prefixList.get(prefix)
        if index is None:
            return b""
        low = index["start_index"]
        high = index["end_index"]

        left = low if low == high else self.binarySearch(low, high, ipnum)
        _, startIp, endIp, localOffset, localLength = self.getIndex(left, 0, 0, 0, 0)

        # binarySearch only guarantees endIp >= ipnum (or falls back to 0),
        # so the candidate range still has to be checked.
        if startIp <= ipnum <= endIp:
            return self.getLocal(localOffset, localLength)
        return b""

    def getLocal(self, localOffset, localLength):
        """Read ``localLength`` bytes starting at file offset ``localOffset``."""
        self.fp.seek(localOffset, SEEK_SET)
        return self.fp.read(localLength)

    def getIndex(self, left, startIp=0, endIp=0, localOffset=0, localLength=0):
        """Read index record number ``left``.

        The four trailing parameters are ignored; they are kept (now with
        defaults) only so existing positional callers keep working.

        Returns:
            ``[left, startIp, endIp, localOffset, localLength]`` for the record.
        """
        self.fp.seek(self.firstStartIpOffset + left * 12, SEEK_SET)
        startIp, endIp, packed = struct.unpack("<III", self.fp.read(12))
        # The last word packs a 3-byte text offset (low bits) and a 1-byte
        # text length (high byte).
        localOffset = packed & 0xFFFFFF
        localLength = packed >> 24
        return [left, startIp, endIp, localOffset, localLength]

    def binarySearch(self, low, high, k):
        """Return the lowest record number in ``[low, high]`` whose end IP is >= ``k``.

        Returns 0 when no record in the interval qualifies.
        """
        m = 0
        while low <= high:
            mid = (low + high) // 2
            if self.getEndIpNum(mid) >= k:
                m = mid
                if mid == 0:
                    break
                high = mid - 1
            else:
                low = mid + 1
        return m

    def getEndIpNum(self, left):
        """Return the end IP (unsigned 32-bit) of index record number ``left``."""
        self.fp.seek(self.firstStartIpOffset + left * 12 + 4, SEEK_SET)
        return struct.unpack("<I", self.fp.read(4))[0]

    def ip2unit(self, ip):
        """Return ``ip`` as an unsigned 32-bit integer.

        ``ip2long`` unpacks with the unsigned ``"!L"`` format, so its result is
        never negative and needs no further adjustment.
        """
        return self.ip2long(ip)

    def ip2long(self, ip):
        """Convert a dotted IPv4 string to its integer value (network byte order)."""
        return struct.unpack("!L", socket.inet_aton(ip))[0]

    def bytesToLong(self, a, b, c, d):
        """Combine four bytes, least significant first, into an unsigned integer.

        Each argument may be an ``int`` (what indexing ``bytes`` yields on
        Python 3) or a length-1 byte string (Python 2 indexing, or a slice).
        """
        b0, b1, b2, b3 = [x if isinstance(x, int) else ord(x) for x in (a, b, c, d)]
        return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)


if __name__ == '__main__':
    import time

    ipSearch = IpSearch()
    # Print the decoded text and let the interpreter pick the console encoding
    # instead of hard-coding a GBK re-encode.
    print(ipSearch.get("210.51.200.123").decode("utf-8"))

    startTime = time.time()
    for i in range(0, 100000):
        ipSearch.get("210.51.200.123")
    endTime = time.time()
    print("time waste: %s" % (endTime - startTime))
测试结果如下:
与百度查出的IP信息进行对比:
还挺不错的,对吧。
标签:
原文地址:http://blog.csdn.net/rongyongfeikai2/article/details/51367906