
Python crawler, lesson 1: building a search engine


Tags: search engine, crawler, python, database, sqlite

from BeautifulSoup import *
from urlparse import urljoin

ignorewords=set(['the','of','to','and','a','in','is','it'])

Our search engine is keyword-based, so we ignore conjunctions and articles (common English stop words).
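As a quick illustration (the word list here is made up), filtering tokens against this set keeps only the meaningful keywords:

words=['the','history','of','the','university']
print [w for w in words if w not in ignorewords]
# prints ['history', 'university']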


The code below is the crawler; it stores each page's text in our SQLite database. Don't worry if you can't follow every line; knowing what each function does is enough.

from sqlite3 import dbapi2 as sqlite
import urllib2
import re

class crawler:
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)
        # connect to (creating if needed) the database; dbname is arbitrary, e.g. 'xxx.db'
    def __del__(self):
        self.con.close()
    def dbcommit(self):
        self.con.commit()
    
    def getentryid(self,table,field,value,createnew=True):
        # return the rowid of value in table.field, inserting a new row if it is absent
        cur=self.con.execute(
            "select rowid from %s where %s='%s'" %(table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute(
                "insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]
    
    
    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing',url
        
        #Get words
        text=self.gettextonly(soup)
        words=self.separatewords(text)
        
        #Get URL id
        urlid=self.getentryid('urllist','url',url)
        
        # Link word to url
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))
            
            
    
    def gettextonly(self,soup):
        # recursively pull the bare text out of a BeautifulSoup node, dropping the tags
        v=soup.string
        if v==None:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            return v.strip()
    
    def separatewords(self,text):
        # split on any run of non-word characters and lowercase everything
        splitter=re.compile(r'\W*')
        return [s.lower() for s in splitter.split(text) if s!='']
    
    def isindexed(self,url):
        u=self.con.execute(
            "select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            #if crawled
            v=self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v != None: return True
        return False
    
    def addlinkref(self,urlFrom,urlTo,linkText):
        # placeholder: storing inter-page links is not used in this lesson
        pass
    
    def crawl(self,pages,depth=2):
        # breadth-first crawl: index each page, collect its outbound links,
        # then repeat on the new pages up to the given depth
        for i in range(depth):
            newpages=set()
            for page in pages:
                try:
                    c=urllib2.urlopen(page)
                except:
                    print "Could not open",page
                    continue
                soup=BeautifulSoup(c.read())
                self.addtoindex(page,soup)
                
                links=soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url=urljoin(page,link['href'])
                        if url.find("'") != -1:
                            continue
                        url=url.split('#')[0] #remove location portion
                        if url[0:4]=='http' and not self.isindexed(url):
                            newpages.add(url)
                            linkText=self.gettextonly(link)
                            self.addlinkref(page,url,linkText)
                self.dbcommit()
            pages=newpages
            
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()
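One caveat about this class: it builds SQL by string formatting, which breaks on values containing quotes (crawl even skips URLs containing ' for exactly this reason) and is unsafe on untrusted input. A safer sketch of getentryid using sqlite3's parameter binding would look like this (not part of the original code; the table and field names still come from our own code, so only the value needs binding):

    def getentryid(self,table,field,value,createnew=True):
        # bind the value as a parameter instead of formatting it into the SQL
        cur=self.con.execute(
            "select rowid from %s where %s=?" % (table,field),(value,))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute(
                "insert into %s (%s) values (?)" % (table,field),(value,))
            return cur.lastrowid
        return res[0]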
Good. Now that we have a crawler, let's write out the pages we want to crawl. Note that each element of pagelist is itself a list of seed URLs, since crawl expects a collection of pages:

pagelist=[['http://en.xjtu.edu.cn/'],
          ['http://www.lib.xjtu.edu.cn/'],
          ['http://en.wikipedia.org/wiki/Xi%27an_Jiaotong_University']]
Create the database:

mycrawler=crawler('searchindex.db')
mycrawler.createindextables()
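createindextables only needs to run once; calling it again raises sqlite3.OperationalError because the tables already exist. To check what was created, you can list the tables (an illustrative snippet using SQLite's built-in sqlite_master catalog):

cur=mycrawler.con.execute("select name from sqlite_master where type='table'")
print [r[0] for r in cur]
# expect: urllist, wordlist, wordlocation, link, linkwords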
Crawl (here just the first seed list):

mycrawler.crawl(pagelist[0])
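Crawling takes a while. A quick sanity check (an illustrative snippet reusing the crawler's own connection) confirms the index is filling up:

cur=mycrawler.con.execute('select count(*) from urllist')
print 'pages indexed:',cur.fetchone()[0]
cur=mycrawler.con.execute('select count(*) from wordlocation')
print 'word locations stored:',cur.fetchone()[0]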
The search engine:

class searcher:
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)
    
    def __del__(self):
        self.con.close()
    
    def getmatchrows(self,q):
        # self-join wordlocation once per query word, so each returned row is
        # (urlid, location of word 1, location of word 2, ...)
        # Strings to build the query
        fieldlist='w0.urlid'
        tablelist=''  
        clauselist=''
        wordids=[]

        # Split the words by spaces
        words=q.split(' ')  
        tablenumber=0

        for word in words:
            #Get the word ID
            wordrow=self.con.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow!=None:
                wordid=wordrow[0]
                wordids.append(wordid)
                if tablenumber>0:
                    tablelist+=','
                    clauselist+=' and '
                    clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber)
                fieldlist+=',w%d.location' % tablenumber
                tablelist+='wordlocation w%d' % tablenumber      
                clauselist+='w%d.wordid=%d' % (tablenumber,wordid)
                tablenumber+=1

        # Create the query from the separate parts
        fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
        print fullquery
        cur=self.con.execute(fullquery)
        rows=[row for row in cur]

        return rows,wordids
    
    def geturlname(self,id):
        return self.con.execute(
            "select url from urllist where rowid=%d" % id).fetchone()[0]
    
    def normalizescores(self,scores,smallIsBetter=0):
        # scale every score into the 0-1 range; invert when smaller raw values are better
        vsmall=0.00001  # avoid division by zero
        if smallIsBetter:
            minscore=min(scores.values())
            return dict([(u,float(minscore)/max(vsmall,l)) for (u,l) in scores.items()])
        else:
            maxscore=max(scores.values())
            if maxscore==0:
                maxscore=vsmall
            return dict([(u,float(c)/maxscore) for (u,c) in scores.items()])

    # scoring methods
    def frequencyscore(self,rows):
        # score pages by how many times the query words occur on them
        counts=dict([(row[0],0) for row in rows])
        for row in rows:
            counts[row[0]]+=1
        return self.normalizescores(counts)
    
    def locationscore(self,rows):
        # score pages by how early in the document the query words appear (lower is better)
        locations=dict([(row[0],1000000) for row in rows])
        for row in rows:
            loc=sum(row[1:])
            if loc<locations[row[0]]:
                locations[row[0]]=loc
        return self.normalizescores(locations,smallIsBetter=1)
    
    def distancescore(self,rows):
        # with a single query word there are no distances to measure
        if len(rows[0])<=2:
            return dict([(row[0],1.0) for row in rows])
        mindistance=dict([(row[0],1000000) for row in rows])
        for row in rows:
            dist=sum([abs(row[i]-row[i-1]) for i in range(2,len(row))])
            if dist < mindistance[row[0]]:
                mindistance[row[0]]=dist
        return self.normalizescores(mindistance,smallIsBetter=1)
#---------------------------------------------------------------------------

    
    def getscoredlist(self,rows,wordids):
        totalscores=dict([(row[0],0) for row in rows])
        
        # only frequencyscore is weighted for now; more (weight,scores) pairs can be
        # added here (an example follows the results below)
        weights=[(1.0,self.frequencyscore(rows))]
        
        for (weight,scores) in weights:
            for url in totalscores:
                totalscores[url]+=weight*scores[url]
        return totalscores
    
    def query(self,q):
        rows,wordids=self.getmatchrows(q)
        scores=self.getscoredlist(rows,wordids)
        rankedscores=sorted([(score,url) for (url,score) in scores.items()],reverse=1)
        for (score,urlid) in rankedscores[:10]:
            print '%f\t%s' % (score,self.geturlname(urlid))
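To make the self-join in getmatchrows concrete: for a two-word query, the fullquery it prints comes out like this (the wordids 10 and 17 are made up):

select w0.urlid,w0.location,w1.location from wordlocation w0,wordlocation w1 where w0.wordid=10 and w0.urlid=w1.urlid and w1.wordid=17

Each row it returns is a urlid followed by one word location per query word, which is exactly the shape the scoring functions consume.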
Hook the searcher up to the database:

e=searcher('searchindex.db')
Search:

e.query('xjtu college')
And with that, your first search engine is built:

1.000000	http://en.xjtu.edu.cn/XJTU_Introduction/Introduction.htm
0.941176	http://en.xjtu.edu.cn/info/1044/1683.htm
0.705882	http://en.xjtu.edu.cn/Schools_and_Colleges.htm
0.529412	http://en.xjtu.edu.cn/info/1044/1681.htm
0.470588	http://en.xjtu.edu.cn/Education/Undergraduate_Education.htm
0.382353	http://en.xjtu.edu.cn/XJTU_News/News.htm
0.382353	http://en.xjtu.edu.cn/Campus_Life/Student_Bodies.htm
0.294118	http://en.xjtu.edu.cn/XJTU_News/Teaching_and_learning.htm
0.294118	http://en.xjtu.edu.cn/info/1044/1572.htm
0.279412	http://en.xjtu.edu.cn/info/1044/1571.htm
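The weights list in getscoredlist is the natural extension point: locationscore and distancescore are already defined, so you can blend all three metrics by changing that one line, for example (the relative weights are just a guess to experiment with):

        weights=[(1.0,self.frequencyscore(rows)),
                 (1.5,self.locationscore(rows)),
                 (1.0,self.distancescore(rows))]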

Original article: http://blog.csdn.net/qq_21970857/article/details/45967635
