
Using bs4 to extract content from Haitou (haitou.cc) pages and store it in MongoDB



Example: http://xyzp.haitou.cc/article/722427.html

First, download each page. You can use os.system("wget " + str(url)) or urllib2.urlopen(url); both are simple, so I won't go into detail.
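For instance, a minimal download loop with urllib2 might look like the sketch below (the article IDs are made up for illustration):

# Save a batch of pages locally before parsing them.
# The ID range here is illustrative, not from the original post.
import urllib2

for article_id in (722427, 722428):
    url = "http://xyzp.haitou.cc/article/%d.html" % article_id
    page = urllib2.urlopen(url).read()
    with open("%d.html" % article_id, "w") as fw:
        fw.write(page)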

Then comes the main part: information extraction:

#!/usr/bin/env python
# coding=utf-8

from bs4 import BeautifulSoup
import codecs
import os
import re
import sys

reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack so non-ASCII text can be written without errors

from pymongo import MongoClient

def get_jdstr(fname):
    """Parse one saved page and extract the fields of interest."""
    retdict = {}
    with open(fname) as fr:
        soup = BeautifulSoup(fr.read())

    jdstr = soup.get_text()

    retdict["inc_name"] = soup.title.string.split()[0]  # company name from the <title>
    retdict["page_content"] = soup.find_all("div", "panel-body panel-body-text")[0].get_text()
    retdict["index_url"] = re.search(r"http://xyzp\.haitou\.cc/article/\d+\.html", jdstr).group()
    retdict["info_from"] = soup.find_all("p", "text-ellipsis")[0].contents[1].get_text()  # info source
    retdict["workplace"] = soup.find_all("p", "text-ellipsis")[1].contents[1].get_text()  # work location
    retdict["info_tag"] = soup.find_all("p", "text-ellipsis")[2].contents[1].get_text()   # tags
    retdict["pub_time"] = soup.find_all("p", "text-ellipsis")[3].contents[1].get_text()   # publication time

    return retdict



def JD_extr():
    """Extract fields from every downloaded page and dump them to a delimited text file."""
    fnames = [fname for fname in os.listdir("./") if fname.endswith(".html")]
    fw = codecs.open("tmp_jd_haitou_clean.csv", "w", "utf-8")
    res = []
    for fname in fnames[:500]:  # process at most the first 500 pages
        tmp = []
        retdict = get_jdstr(fname)
        res.append(retdict)
        for k, v in retdict.iteritems():
            tmp.append(v)
        fw.write(" , ".join(tmp) + "\n")
        fw.write("===" * 20 + "\n")
    fw.close()
    print len(res), "files done!"
    return res



def change2html():
    """Rename downloaded .txt files to .html so JD_extr() picks them up."""
    fnames = [fname for fname in os.listdir("./") if fname.endswith(".txt")]
    for fname in fnames:
        cmd = "mv " + str(fname) + " " + fname[:-3] + "html"
        print cmd
        os.system(cmd)


def store2mongodb():
    client = MongoClient("localhost", 27017)
    db = client.JD_Haitou

    documents = JD_extr()
    for d in documents:
        db.haitouJD.insert(d)  # with pymongo >= 3.0, prefer insert_one()

    mycol = db["haitouJD"]
    print mycol.count()



def split_jd_test_data(fname="./tmp_jd_haitou_clean.csv"):
    fw = codecs.open("./split_jd_res.csv", "w", "utf-8")
    fr = codecs.open(fname, "r", "utf-8")
    indexurl = re.compile(r"http://xyzp\.haitou\.cc/article/\d+\.html")
    for line in fr:
        if indexurl.search(line):
            url = indexurl.search(line).group()
            cnt = 1  # defaults to 1
            fw.write(url + "\t" + str(cnt) + "\n")
    fr.close()
    fw.close()




if __name__ == "__main__":
    JD_extr()  # extract and write to file
    store2mongodb()
    split_jd_test_data()
    print "done"

 

Original source: http://www.cnblogs.com/jkmiao/p/4846799.html
