码迷,mamicode.com
首页 > 编程语言 > 详细

qqzeng-ip.dat IP库读取python版

时间:2016-05-12 16:05:06      阅读:824      评论:0      收藏:0      [点我收藏+]

标签:

qqzeng-ip.dat是一个特殊格式的dat文件,可以快速地查找IP对应的地理位置信息。据作者的测试结果,查找100万个IP耗时0.5秒。

当然这和语言有非常大的关系,python的循环性能一直为人所诟病。目前python版本的测试结果是查找10万个IP耗时3.X秒左右,还算够用,毕竟真实情况下,30秒~5分钟内的一批日志数据中,不重复的IP不太可能超过10万个。

作者提供了解析dat的java/c/php脚本,但没有提供python版本。所以我就写了一个,供需要用python读取IP地理位置信息的读者使用。

代码如下:

#coding:utf-8
import os
import math
import socket
import struct
import io
from io import SEEK_SET

# Default database location: "qqzeng-ip-utf8.dat" in the same directory as this module.
_moduleDir = os.path.dirname(os.path.abspath(__file__))
path = os.path.normpath(_moduleDir + "/qqzeng-ip-utf8.dat")

class IpSearch(object):
    """Reader for a qqzeng-ip ``.dat`` IP geolocation database.

    File layout as parsed by this class (all integers are little-endian):

    * 16-byte header holding four uint32 values: offset of the first index
      record, offset of the last index record, offset of the first prefix
      record and offset of the last prefix record.
    * Index records, 12 bytes each: start IP (uint32), end IP (uint32),
      location offset (3 bytes) and location length (1 byte).
    * Prefix records, 9 bytes each: first IP octet (1 byte) followed by the
      first and last index-record numbers (uint32 each) for that octet.

    The file stays open for the lifetime of the object; call ``close()`` or
    use the instance as a context manager to release it deterministically.
    """

    # Class-level defaults; every instance overwrites them in __init__.
    fp = None
    firstStartIpOffset = None
    lastStartIpOffset = None
    preStartOffset = None
    preEndOffset = None
    ipCount = None
    prefixCount = None
    prefixList = None

    def __init__(self, datPath=None):
        """Open the database and load the header and the prefix table.

        Args:
            datPath: Path of the ``.dat`` file. When omitted, the
                module-level ``path`` is used.

        Raises:
            IOError/OSError: If the file cannot be opened.
            struct.error: If the header or prefix table is truncated.
        """
        if datPath is None:
            datPath = path
        self.fp = io.open(datPath, "rb")
        try:
            header = self.fp.read(16)
            (self.firstStartIpOffset, self.lastStartIpOffset,
             self.preStartOffset, self.preEndOffset) = struct.unpack("<4I", header)
            # Both offsets point at the *start* of the last record, hence "+ 1".
            self.ipCount = (self.lastStartIpOffset - self.firstStartIpOffset) // 12 + 1
            self.prefixCount = (self.preEndOffset - self.preStartOffset) // 9 + 1

            # Per-instance table: a class-level dict would be shared by all readers.
            self.prefixList = {}
            self.fp.seek(self.preStartOffset, SEEK_SET)
            preBuff = self.fp.read(self.prefixCount * 9)
            for k in range(self.prefixCount):
                prefix, startIndex, endIndex = struct.unpack_from("<BII", preBuff, k * 9)
                self.prefixList[prefix] = {
                    "start_index": startIndex,
                    "end_index": endIndex,
                }
        except Exception:
            # Do not leak the file handle when the file turns out to be malformed.
            self.close()
            raise

    def close(self):
        """Close the underlying file; safe to call more than once."""
        if self.fp is not None:
            self.fp.close()
            self.fp = None

    def __enter__(self):
        return self

    def __exit__(self, excType, excValue, traceback):
        self.close()
        return False

    def __del__(self):
        self.close()

    def get(self, ip):
        """Return the raw location bytes stored for a dotted-quad IPv4 string.

        Args:
            ip: Address such as ``"210.51.200.123"``.

        Returns:
            The location record as read from the file (bytes), or ``b""``
            when ``ip`` is empty or no record covers the address.

        Raises:
            ValueError: If the first dotted component is not an integer.
            socket.error: If ``socket.inet_aton`` rejects the address.
        """
        if not ip:
            return b""

        prefix = int(ip.split(".")[0])
        ipnum = self.ip2unit(ip)
        index = self.prefixList.get(prefix)
        if index is None:
            return b""

        low = index["start_index"]
        high = index["end_index"]
        left = low if low == high else self.binarySearch(low, high, ipnum)
        _, startIp, endIp, localOffset, localLength = self.getIndex(left)
        if startIp <= ipnum <= endIp:
            return self.getLocal(localOffset, localLength)
        return b""

    def getLocal(self, localOffset, localLength):
        """Read ``localLength`` bytes starting at absolute offset ``localOffset``."""
        self.fp.seek(localOffset, SEEK_SET)
        return self.fp.read(localLength)

    def getIndex(self, left, startIp=0, endIp=0, localOffset=0, localLength=0):
        """Read index record number ``left``.

        The trailing parameters are ignored; they are kept (now optional)
        only so existing five-argument calls keep working.

        Returns:
            ``[left, startIp, endIp, localOffset, localLength]``.
        """
        self.fp.seek(self.firstStartIpOffset + left * 12, SEEK_SET)
        startIp, endIp, packed = struct.unpack("<III", self.fp.read(12))
        # The last four bytes are a 3-byte offset followed by a 1-byte length;
        # read as one little-endian uint32 they are its low 24 / high 8 bits.
        localOffset = packed & 0xFFFFFF
        localLength = packed >> 24
        return [left, startIp, endIp, localOffset, localLength]

    def binarySearch(self, low, high, k):
        """Return the lowest record number in ``[low, high]`` whose end IP is >= ``k``.

        Returns 0 when no record in the range qualifies.
        """
        m = 0
        while low <= high:
            mid = (low + high) // 2
            if self.getEndIpNum(mid) >= k:
                m = mid
                if mid == 0:
                    break
                high = mid - 1
            else:
                low = mid + 1
        return m

    def getEndIpNum(self, left):
        """Return the end IP (unsigned int) of index record number ``left``."""
        # The end IP sits 4 bytes into the 12-byte record, after the start IP.
        self.fp.seek(self.firstStartIpOffset + left * 12 + 4, SEEK_SET)
        return struct.unpack("<I", self.fp.read(4))[0]

    def ip2unit(self, ip):
        """Return ``ip`` as an unsigned 32-bit integer."""
        # ip2long unpacks with the unsigned "!L" format, so it is never negative.
        return self.ip2long(ip)

    def ip2long(self, ip):
        """Convert a dotted-quad string to an integer in network byte order."""
        packedIP = socket.inet_aton(ip)
        return struct.unpack("!L", packedIP)[0]

    def bytesToLong(self, a, b, c, d):
        """Combine four bytes, least significant first, into an unsigned int.

        Each argument may be a one-character byte string or an int.
        """
        values = [ord(x) if isinstance(x, (bytes, str)) else x for x in (a, b, c, d)]
        return values[0] | (values[1] << 8) | (values[2] << 16) | (values[3] << 24)

if __name__ == '__main__':
    import time

    # Single lookup; the UTF-8 record is re-encoded to GBK before printing
    # (presumably for a GBK console -- confirm for your terminal).
    searcher = IpSearch()
    location = searcher.get("210.51.200.123")
    print(location.decode("utf-8").encode("gbk"))

    # Rough benchmark: time 100000 repeated lookups of the same address.
    began = time.time()
    for i in range(0, 100000):
        searcher.get("210.51.200.123")
    finished = time.time()
    print("time waste: %s" % (finished - began))


测试结果如下:

技术分享

与百度查出的IP信息进行对比:

技术分享

还挺不错的,对吧。

qqzeng-ip.dat IP库读取python版

标签:

原文地址:http://blog.csdn.net/rongyongfeikai2/article/details/51367906

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!