码迷,mamicode.com
首页 > 编程语言 > 详细

python网络爬虫(14)博客园用户信息爬取

时间:2019-07-21 01:50:16      阅读:154      评论:0      收藏:0      [点我收藏+]

标签:arch   日期   cut   digest   imp   spl   RoCE   varchar   方案   

说明

这里只放代码,方案和技术没有变更。

代码说明

需要用cookies绕过登录,使用selenium在Firefox下模拟,因此需要安装geckodriver驱动。另外,数据存储在sqlite中,也需要安装。

Spider.py

import HtmlDownloader
import HtmlParser
import DataOutput
import UrlManager
import re
from selenium import webdriver
class Spider(object):
    """Crawl cnblogs user home pages and store each user's profile.

    Wires together the downloader, parser, URL manager and data output and
    drives them all through one Selenium Firefox session.
    """

    # Non-greedy "/.../" segments of a URL.  For
    # "https://www.cnblogs.com/<user>/" the matches are "//" and "/<user>/",
    # so index 1 is the user's path segment.
    _PATH_SEGMENT = re.compile('/.*?/')

    # The crawl loop stops once the URL manager reports this many visited URLs.
    _MAX_VISITED = 2000

    def __init__(self):
        self.downloader = HtmlDownloader.HtmlDownloader()
        self.parser = HtmlParser.HtmlParser()
        self.output = DataOutput.DataOutput()
        self.urlManager = UrlManager.UrlManager()
        self.driver = webdriver.Firefox()

    def crawl(self, root_url):
        """Crawl user pages reachable from *root_url* and store their data.

        The root page is downloaded first (with cookies applied by the
        downloader) and every user link found on it is queued.  Each queued
        link is rewritten to the matching ``https://home.cnblogs.com/<user>/``
        page, which is downloaded, mined for further links, and parsed into a
        profile record.

        The output is closed and the URL sets are saved even if the loop is
        interrupted by an exception, so crawl progress is not lost.
        """
        content = self.downloader.download_root(root_url, self.driver)
        self.urlManager.add_urls(self.parser.parser_url(content))
        i = 0  # number of pages processed without error
        try:
            while (self.urlManager.new_urls_size() > 0
                   and self.urlManager.old_urls_size() < self._MAX_VISITED):
                url = self.urlManager.get_new_url()
                i += 1
                print(str(i) + ':' + str(url))

                segments = self._PATH_SEGMENT.findall(url)
                if len(segments) < 2:
                    # No "/<user>/" segment to build a home-page URL from;
                    # skip this entry instead of aborting the whole crawl.
                    i -= 1
                    print('error url has no user segment:' + str(url))
                    continue
                url = 'https://home.cnblogs.com' + segments[1]

                content = self.downloader.download(self.driver, url)
                self.urlManager.add_urls(self.parser.parser_url(content))

                try:
                    content = self.parser.parser_data(self.driver)
                    self.output.store_data(content)
                except Exception:
                    # Best effort: a page that cannot be parsed or stored is
                    # reported and not counted; the crawl continues.
                    i -= 1
                    print('error url may not exits:' + self.driver.current_url)
        finally:
            self.output.output_end()
            self.urlManager.save_status()
        # NOTE: the Firefox window is not closed here; the original code had
        # the driver.close() call commented out.
        print('ed')
if __name__ == '__main__':
    # Script entry point: start the crawl from the cnblogs front page.
    spider = Spider()
    spider.crawl('https://www.cnblogs.com/')
    
    
    

UrlManager.py

import pickle
import hashlib
import re
class UrlManager():
    """Track pending and visited URLs and persist both sets with pickle.

    ``new_urls`` holds the URLs still to be crawled.  ``old_urls`` holds, for
    every URL already handed out, the middle 16 hex characters of its MD5
    digest.
    """

    def __init__(self):
        # Each set is read from the same file that save_status() writes it to.
        self.new_urls = self.load_process('new_urls')
        self.old_urls = self.load_process('old_urls')

    @staticmethod
    def _digest(url):
        """Return the middle 16 hex characters of the MD5 digest of *url*."""
        return hashlib.md5(url.encode('utf8')).hexdigest()[8:-8]

    def load_process(self, file_name):
        """Load a pickled set from *file_name*.

        If the file cannot be opened or unpickled, an empty set is written to
        *file_name* and returned instead.
        """
        print('loading .')
        try:
            # NOTE(security): pickle.load can execute arbitrary code while
            # unpickling; only state files written by this program should
            # ever be placed at this path.
            with open(file_name, 'rb') as f:
                return pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError):
            print('file may not exist.will create it')
        new_set = set()
        self.save_process(file_name, new_set)
        return new_set

    def save_process(self, file_name, data):
        """Pickle *data* into *file_name*, replacing any previous content."""
        with open(file_name, 'wb') as f:
            pickle.dump(data, f)

    def save_status(self):
        """Write both URL sets to their state files."""
        self.save_process('new_urls', self.new_urls)
        self.save_process('old_urls', self.old_urls)

    def add_urls(self, urls):
        """Queue every URL in *urls* that is neither pending nor visited."""
        for url in urls:
            if url not in self.new_urls and self._digest(url) not in self.old_urls:
                self.new_urls.add(url)

    def get_new_url(self):
        """Remove and return one pending URL, recording it as visited.

        Raises:
            KeyError: if there is no pending URL.
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(self._digest(new_url))
        return new_url

    def new_urls_size(self):
        """Return the number of pending URLs."""
        return len(self.new_urls)

    def old_urls_size(self):
        """Return the number of visited URLs."""
        return len(self.old_urls)
    
    
    
    

HtmlParser.py

import re
import json
class HtmlParser(object):
    """Extract cnblogs user links and user profile fields."""

    # Raw string with escaped dots so only the literal host name matches.
    _USER_URL = re.compile(r'https://www\.cnblogs\.com/\w*/')

    # Profile label shown on the page -> key used in the returned record.
    _FIELD_NAMES = {
        '用户ID': 'user_id',
        '姓名': 'name',
        '性别': 'sex',
        '出生日期': 'birth_day',
        '家乡': 'hometown',
        '现居住地': 'live_place',
        '单位': 'work_for',
        '工作状况': 'job_status',
        '感兴趣的技术': 'interest_technology',
        '最近目标': 'recent_goal',
        '座右铭': 'mark_words',
        '自我介绍': 'introduce',
        '园龄': 'blog_age',
        '博客': 'blog_address',
        '婚姻': 'marriage',
        '职位': 'position',
        'QQ': 'qq',
        'Email': 'email',
    }

    def parser_url(self, content):
        """Return the distinct ``https://www.cnblogs.com/<name>/`` URLs in *content*.

        The result is a list in no particular order.
        """
        return list(set(self._USER_URL.findall(content)))

    def parser_data(self, driver):
        """Read the profile on the driver's current page into a dict.

        The text of the ``display_name`` element becomes ``user_id``; the text
        of the ``user_profile`` element is split into lines of the form
        ``label:value``.  A line whose label is not a known field, or that has
        no colon, is treated as a continuation and appended to the value of
        the previous field.

        Args:
            driver: a Selenium WebDriver already showing a user home page.

        Returns:
            dict mapping field keys (see ``_FIELD_NAMES``) to string values.
        """
        # 'class name' is the locator strategy string behind selenium's
        # By.CLASS_NAME; the generic find_element(by, value) call is used
        # because the find_element_by_* helpers were removed in Selenium 4.
        user_id = driver.find_element('class name', 'display_name').text
        profile = driver.find_element('class name', 'user_profile').text
        lines = profile.split('\n')
        lines.insert(0, '用户ID:' + user_id + '\n')

        # NOTE(review): the separator below is the ASCII colon exactly as in
        # the original code; confirm it matches the delimiter the page uses.
        data = {}
        key = ''
        value = ''
        for each in lines:
            each = each.replace('\n', '')
            # Split on the first colon only, so values that themselves contain
            # a colon (such as the blog URL) are kept whole.
            label, sep, rest = each.partition(':')
            if sep and label in self._FIELD_NAMES:
                key = self._FIELD_NAMES[label]
                value = rest
                data[key] = value
            else:
                print('split error:' + each + 'auto fixed..')
                value = value + each
                data[key] = value
                print(data)
        return data
    
    

HtmlDownloader.py

import json
class HtmlDownloader(object):
    """Fetch pages through a Selenium WebDriver."""

    def download_root(self, url, driver, cookie_file='cookies.json'):
        """Load *url*, apply saved cookies, refresh, and return the page source.

        Args:
            url: address of the page to open.
            driver: Selenium WebDriver used for the request.
            cookie_file: path of a UTF-8 JSON file holding a list of cookie
                objects; each must provide ``domain``, ``name`` and ``value``.
                Defaults to ``cookies.json`` in the working directory.

        Returns:
            The page source after the refresh.

        Raises:
            OSError: if *cookie_file* cannot be opened.
            KeyError: if a cookie entry lacks one of the required keys.
        """
        driver.get(url)
        with open(cookie_file, 'r', encoding='utf-8') as f:
            cookies = json.load(f)
        for cookie in cookies:
            # Only these three fields are forwarded to the browser.
            driver.add_cookie({
                # The domain needs a leading dot, e.g. ".xxx.com"
                # (note carried over from the original author).
                'domain': cookie['domain'],
                'name': cookie['name'],
                'value': cookie['value'],
            })
        driver.refresh()
        return driver.page_source

    def download(self, driver, url):
        """Load *url* in *driver* and return the resulting page source."""
        driver.get(url)
        return driver.page_source

DataOutput.py

import sqlite3
class DataOutput(object):
    """Persist scraped user profiles into a SQLite table."""

    # Columns a profile record may fill; ``id`` is assigned by SQLite.
    _COLUMNS = (
        'user_id', 'name', 'sex', 'birth_day', 'hometown', 'live_place',
        'marriage', 'position', 'work_for', 'job_status',
        'interest_technology', 'recent_goal', 'mark_words', 'introduce',
        'blog_age', 'blog_address', 'qq', 'email',
    )

    def __init__(self, db_path="cnblog.db"):
        """Open (or create) the database at *db_path* and ensure the table exists."""
        self.cx = sqlite3.connect(db_path)
        self.table_name = 'cnblog'
        self.create_table()

    def create_table(self):
        """Create the profile table if it does not exist yet."""
        values = '''
        id integer primary key autoincrement,
        user_id varchar(50) not null,
        name varchar(50),
        sex varchar(6),
        birth_day varchar(30),
        hometown varchar(50),
        live_place varchar(50),
        marriage varchar(20),
        position varchar(30),
        work_for varchar(50),
        job_status varchar(20),
        interest_technology varchar(200),
        recent_goal varchar(500),
        mark_words varchar(500),
        introduce varchar(500),
        blog_age varchar(30),
        blog_address varchar(100),
        qq varchar(15),
        email varchar(30)
        '''
        self.cx.execute('create table if not exists %s(%s)' % (self.table_name, values))

    def store_data(self, data):
        """Insert one profile record as a single row and commit.

        Values are passed as bound parameters, so quotes or SQL fragments in
        scraped text are stored verbatim instead of being interpreted.  An
        empty *data* stores nothing.

        Args:
            data: dict mapping column names (see ``_COLUMNS``) to values.

        Raises:
            ValueError: if *data* contains a key that is not a known column.
            sqlite3.IntegrityError: if ``user_id`` is missing (NOT NULL column).
        """
        if not data:
            return
        unknown = [key for key in data if key not in self._COLUMNS]
        if unknown:
            # Column names cannot be bound as parameters, so only whitelisted
            # names are ever placed into the statement text.
            raise ValueError('unknown column(s): ' + ', '.join(map(str, unknown)))
        columns = list(data)
        cmd = 'insert into %s (%s) values (%s)' % (
            self.table_name,
            ', '.join(columns),
            ', '.join(['?'] * len(columns)),
        )
        self.cx.execute(cmd, [data[column] for column in columns])
        self.cx.commit()

    def output_end(self):
        """Close the database connection."""
        self.cx.close()
        
        

 

python网络爬虫(14)博客园用户信息爬取

标签:arch   日期   cut   digest   imp   spl   RoCE   varchar   方案   

原文地址:https://www.cnblogs.com/bai2018/p/11219819.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!