
How can a Python crawler scrape Zhihu topics?



I'm building Guandian (观点, guandn.com), whose "rooms" work much like Zhihu topics, so I needed a way to crawl Zhihu's topic list. After some fiddling it now works reliably. The code is written in Python (2.x); if you don't know Python yet, please learn the basics on your own first. If you do, just read the code below, it works as-is.

#!/usr/bin/env python
# coding: utf-8
# Crawler for http://www.guandn.com/ -- collects Zhihu topic categories and the topics under them.
__author__ = 'haoning'

import urllib
import urllib2
import time
import re
import json
import uuid
import platform
import os
import sys
import cookielib
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding( "utf-8" )

# note: the Cookie below is the original author's logged-in Zhihu session; replace it with your own
headers = {
   'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
   'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
   'X-Requested-With': 'XMLHttpRequest',
   'Referer': 'https://www.zhihu.com/topics',
   'Cookie': '__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    # Fetch a URL and return the raw response body, or None on failure.
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 20)  # a proxy could be added here
        html = response.read()
        return html
    except Exception as e:
        print "get_html error", e
    return None

def getTopics():
    # Fetch the topic category list from https://www.zhihu.com/topics and
    # store each (data_id, name) pair in the classify_new table.
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy could be added here
        html = response.read().decode('utf-8')
        # print html  # debug output
        soup = BeautifulSoup(html, 'html.parser')
        lis = soup.find_all('li', {'class': 'zm-topic-cat-item'})

        for li in lis:
            data_id = li.get('data-id')
            name = li.text
            curr.execute('select id from classify_new where name=%s', (name,))
            y = curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name) VALUES(%s,%s)', (data_id, name))
        conn.commit()
    except Exception as e:
        print "get topic error", e
        

def get_extension(name):
    # Return the file extension (including the dot), or None if there is none.
    where = name.rfind('.')
    if where != -1:
        return name[where:]
    return None


def which_platform():
    sys_str = platform.system()
    return sys_str

def GetDateString():
    when = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    foldername = str(when)
    return foldername 

def makeDateFolder(par, classify):
    # Create <par>/<date>/<classify> if it does not exist and return its path.
    try:
        if os.path.isdir(par):
            newFolderName = par + '//' + GetDateString() + '//' + classify
            if which_platform() == "Linux":
                newFolderName = par + '/' + GetDateString() + "/" + classify
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception, e:
        print "makeDateFolder error", e
    return None

def download_img(url, classify):
    # Download the image at url into E://topic_pic/<date>/<classify>/ and
    # return its relative path, or None if it could not be saved.
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 15)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E://topic_pic"
        folder = makeDateFolder(top, classify)
        if folder is None:
            return None
        filename = folder + "//" + name
        #print "filename", filename
        try:
            if "e82bab09c_xs" not in str(url):  # skip one specific image the original code filters out
                if not os.path.exists(filename):
                    file_object = open(filename, 'w+b')
                    file_object.write(dataimg)
                    file_object.close()
                    return GetDateString() + '/' + classify + "/" + name
                else:
                    print "file exist"
                    return None
        except IOError, e1:
            print "e1=", e1
    except Exception as e:
        print "download_img error", e
    return None  # if the image was not saved, the caller falls back to the original site's link
    
                
def get_topis(top_id, topic_name):
    # Page through the topics under one category via Zhihu's ajax endpoint,
    # 20 items per request, until the response contains fewer than 5 entries.
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    top_id = str(top_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":' + top_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            data = urllib.urlencode(values)
            request = urllib2.Request(url, data, headers)
            response = urllib2.urlopen(request)
            html = response.read().decode('utf-8')
            if html is None:
                return
            json_str = json.loads(html)
            ms = json_str['msg']
            if len(ms) < 5:
                break
            msg = ms[0]
            #print msg
            soup = BeautifulSoup(str(msg), 'html.parser')
            blks = soup.find_all('div', {'class': 'blk'})
            for blk in blks:
                page = blk.find('a').get('href')
                if page is not None:
                    node = page.replace("/topic/", "")
                    print node, page
        except urllib2.URLError, e:
            print "error is", e
                

def work():
    #getTopics() # fetch the topic categories first (run once to populate classify_new)
    curr.execute('select data_id,name from classify_new')
    results = curr.fetchall()
    for r in results:
        data_id = r[0]
        name = r[1]
        get_topis(data_id, name)
        
if __name__ == '__main__':
    for i in range(40):  # run the crawl 40 passes, as in the original
        work()

  

A quick note on the database: I won't attach a dump here. The schema is trivial, just create the table from the fields used in the code. I use MySQL; build it however your own needs dictate. A minimal sketch follows below.
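For reference, here is a minimal sketch of the classify_new table the script reads and writes. The exact column types and sizes are my own guess, since the code only touches id, data_id and name; it also assumes the zhihu database already exists and uses the same connection settings as the crawler above.

# minimal sketch, assuming MySQL and the same connection settings as the crawler;
# adjust column types and sizes to your own needs
import MySQLdb as mdb

ddl = """
CREATE TABLE IF NOT EXISTS classify_new (
    id      INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    data_id INT NOT NULL,             -- category id taken from the li data-id attribute
    name    VARCHAR(255) NOT NULL     -- category name shown on zhihu.com/topics
) DEFAULT CHARSET=utf8
"""

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
cur = conn.cursor()
cur.execute(ddl)
conn.commit()
conn.close()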

If anything is unclear, come find me at quzhuanpan (去转盘网), another site I built; the QQ group number is kept up to date there. I won't post a QQ number here directly, to avoid getting flagged by the system.


Original post: https://www.cnblogs.com/huangxie/p/8206460.html
