码迷,mamicode.com
首页 > 其他好文 > 详细

数据结构化与保存

时间:2018-04-12 20:55:59      阅读:151      评论:0      收藏:0      [点我收藏+]

标签:学院   列表   das   sele   soup   pandas   bs4   冒号   des   

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas

news_list = []


def crawlOnePageSchoolNews(page_url):
    """Crawl one news list page and collect a record for every entry on it.

    For each ``.news-list > li`` element the list-page fields are printed,
    the linked detail page is parsed with ``getNewDetail`` and the resulting
    dict (with the list-page title stored under ``'标题'``) is appended to
    the module-level ``news_list``.

    Args:
        page_url: URL of the list page to crawl.

    Returns:
        The module-level ``news_list`` itself (not a copy); it keeps
        accumulating records across calls.
    """
    res0 = requests.get(page_url)
    res0.encoding = 'UTF-8'
    soup0 = BeautifulSoup(res0.text, 'html.parser')

    for item in soup0.select('.news-list > li'):
        # Evaluate each selector once and reuse the result below.
        link = item.a.attrs['href']
        title = item.select('.news-list-title')[0].text
        description = item.a.select('.news-list-description')[0].text
        info_spans = item.a.select('.news-list-info > span')

        print('**' * 5 + '列表页信息' + '**' * 10)
        print('新闻链接:' + link)
        print('新闻标题:' + title)
        print('新闻描述:' + description)
        print('新闻时间:' + info_spans[0].text)
        print('新闻来源:' + info_spans[1].text)

        # Use a separate name for the detail record so the list being
        # iterated is never shadowed.
        record = getNewDetail(link)
        record['标题'] = title
        news_list.append(record)
    return news_list


def _split_info_segment(segment):
    """Split a ``label:value`` segment on its first ASCII or full-width colon.

    Returns ``(label, value)``; ``value`` is ``None`` when the segment
    contains no colon at all.
    """
    parts = re.split('[:\uff1a]', segment, maxsplit=1)
    if len(parts) == 2:
        return parts[0], parts[1]
    return segment, None


def getNewDetail(href):
    """Fetch one news detail page and return its fields as a dict.

    Args:
        href: URL of the news detail page.

    Returns:
        A dict that always has ``'内容'`` (the text of ``#content``, or an
        empty string when that element is missing). When the page has a
        ``.show-info`` element, every field of ``info_list`` found in it is
        added: ``'发布时间'`` as a ``datetime``, ``'点击'`` as returned by
        ``getClickCount``, and the remaining fields as stripped strings.
    """
    print('**' * 5 + '详情页信息' + '**' * 10)
    print(href)
    res1 = requests.get(href)
    res1.encoding = 'UTF-8'
    soup1 = BeautifulSoup(res1.text, 'html.parser')

    news = {}
    content_nodes = soup1.select('#content')
    if content_nodes:
        news_content = content_nodes[0].text
        news['内容'] = news_content
        print(news_content)  # article body
    else:
        news['内容'] = ''

    info_nodes = soup1.select('.show-info')
    if not info_nodes:  # some pages have no show-info element
        return news
    news_info = info_nodes[0].text

    info_list = ['来源', '发布时间', '点击', '作者', '审核', '摄影']  # fields to parse
    # &nbsp; in the page is parsed into \xa0, so \xa0 is the field separator;
    # blank segments are dropped.
    segments = {seg for seg in news_info.split('\xa0') if seg.strip()}

    for segment in segments:
        # The publish time is followed by an ASCII colon; the other labels
        # presumably use a full-width colon (verify against the live page),
        # so the split accepts either one.
        label, value = _split_info_segment(segment)
        for info_flag in info_list:
            # Match against the label only, so that a value which happens to
            # contain another field name cannot create a bogus field.
            if info_flag not in label:
                continue
            if info_flag == '点击':
                # The click count is written by JavaScript after a separate
                # PHP request keyed by the article id, so it is fetched
                # separately instead of being read from the page text.
                news[info_flag] = getClickCount(href)
            elif value is None:
                # Label without any colon: nothing to extract.
                continue
            elif info_flag == '发布时间':
                # Store the publish time as a datetime so it can be saved to
                # a database later; stripping first makes the parse
                # independent of surrounding whitespace.
                release_time = datetime.strptime(value.strip(), '%Y-%m-%d %H:%M:%S')
                news[info_flag] = release_time
                print(info_flag + ':', release_time)
            else:
                news[info_flag] = value.strip()
                print(info_flag + ':' + news[info_flag])
    print('————' * 40)
    return news


def getClickCount(news_url):
    """Return the click count of a news article as a string of digits.

    The article id is the file name of the detail-page URL (the part
    between the last ``/`` and ``.html``). It is passed to the counter API,
    whose response is a JavaScript snippet that writes the count into the
    ``#hits`` element; the count is extracted from that snippet.

    Args:
        news_url: URL of the news detail page.

    Returns:
        The click count exactly as it appears in the API response (a
        ``str``).

    Raises:
        AttributeError: if ``news_url`` or the API response does not match
            the expected pattern (``re.search`` returns ``None``).
    """
    click_num_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'
    # Raw strings and an escaped dot so that '.html' is matched literally.
    news_id = re.search(r'_(.*)/(.*)\.html', news_url).group(2)
    res2 = requests.get(click_num_url.format(news_id))
    res2.encoding = 'UTF-8'
    click_num = re.search(r"\$\('#hits'\)\.html\('(\d*)'\)", res2.text).group(1)
    print('点击:' + click_num)
    return click_num


# Crawl the first list page and print the records collected so far.
print(crawlOnePageSchoolNews('http://news.gzcc.cn/html/xiaoyuanxinwen/'))

# Read the total number of news items from the pager link of the first page.
pageURL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'
res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'UTF-8'
soup = BeautifulSoup(res.text, 'html.parser')
newsSum = int(re.search(r'(\d*)条', soup.select('a.a1')[0].text).group(1))
# Integer ceiling division; the page size of 10 is the one the original
# page-count computation assumed.
pageSum = (newsSum + 9) // 10

# Page 1 was crawled above; the remaining pages are numbered from 2.
for i in range(2, pageSum + 1):
    crawlOnePageSchoolNews(pageURL.format(i))


# Structure the collected records and save them.
dit = pandas.DataFrame(news_list)
dit.to_excel('test.xlsx')
dit.to_csv('test.csv')

print(dit[['作者', '来源']][:6])
# Click counts are collected as strings; convert them to numbers before
# comparing with an integer threshold (unparseable values become NaN).
clicks = pandas.to_numeric(dit['点击'], errors='coerce')
print(dit[(dit['来源'] == '学校综合办') & (clicks > 3000)])
print(dit[dit['来源'].isin(['国际学院', '学生工作处'])])

 

数据结构化与保存

标签:学院   列表   das   sele   soup   pandas   bs4   冒号   des   

原文地址:https://www.cnblogs.com/onlythisone/p/8810144.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!