码迷,mamicode.com
首页 > 其他好文 > 详细

爬取校园新闻首页的新闻的详情,使用正则表达式,函数抽离

时间:2018-04-08 22:41:07      阅读:229      评论:0      收藏:0      [点我收藏+]

标签:tail   html   encoding   group   title   inf   split()   .gz   rap   

import requests
import re

# Index page of the campus-news section; this is the page the script crawls.
url = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(url, timeout=10)
# Decode the body as UTF-8 instead of relying on the encoding requests guesses.
res.encoding = "utf-8"

# Build the parse tree with BeautifulSoup's built-in HTML parser.
from bs4 import BeautifulSoup

soup = BeautifulSoup(res.text, "html.parser")


def getClickCount(url):
    """Print the click count and the news number of one article.

    The news number is the text between the first underscore and ``.html``
    in *url*. Its last ``/``-separated segment is sent as the ``id``
    parameter of the click-count API, so every article reports its own
    count instead of the count of one fixed article.

    Args:
        url: Address of a news article page; it must contain
            ``_<number>.html``.

    Raises:
        ValueError: If no news number can be found in *url*.
    """
    matched = re.search(r"_(.*)\.html", url)
    if matched is None:
        # Fail before any network traffic, with a message naming the URL.
        raise ValueError("cannot find a news number in URL: %r" % (url,))
    news_number = matched.group(1)
    news_id = news_number.split("/")[-1]

    hit_url = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(news_id)
    reply = requests.get(hit_url, timeout=10).text
    # Keep what follows the last ".html" and peel off the wrapping (' ... ');
    # NOTE(review): presumably the reply looks like $('#hits').html('123'); - confirm.
    hit_number = reply.split(".html")[-1].lstrip("('").rstrip("');")
    print("点击次数:", hit_number)
    print("新闻编号:", news_number)


def getNewDetail(url):
    """Print the details of every news item listed on an index page.

    Every ``<li>`` on the page at *url* that contains a ``.news-list-title``
    element is treated as a news entry. For each entry the title and
    description are taken from the index page, then the linked article is
    downloaded and its info line, click count and news number (via
    ``getClickCount``), link and full content are printed.

    Args:
        url: Address of the news index page to crawl.
    """
    separator = "------------------------------------------------------------------------------"

    listing = requests.get(url, timeout=10)
    listing.encoding = "utf-8"
    listing_soup = BeautifulSoup(listing.text, "html.parser")

    for news in listing_soup.select("li"):
        titles = news.select(".news-list-title")
        if not titles:
            # This <li> carries no news title, so it is skipped.
            continue

        # Title, description and link as shown on the index page.
        title = titles[0].text
        description = news.select(".news-list-description")[0].text
        href = news.select("a")[0]["href"]

        # Separate names for the article page, so the index page objects
        # being iterated are never overwritten.
        article = requests.get(href, timeout=10)
        article.encoding = "utf-8"
        article_soup = BeautifulSoup(article.text, "html.parser")

        # Info line and body text of the article itself.
        newinfo = article_soup.select(".show-info")[0].text
        content = article_soup.select("#content")[0].text

        # The info line is split on whitespace once. It is assumed to hold, in
        # order: date, time of day, author, reviewer, source, photographer.
        # Missing trailing fields are padded with "" so a short line prints
        # blanks instead of raising IndexError.
        fields = newinfo.split()
        fields += [""] * (6 - len(fields))
        date, clock, author, checker, source, photography = fields[:6]

        print(separator)
        print("文章标题:" + title)
        print("\n文章描述:" + description)
        print("\n文章信息:\n" + date + " " + clock + "\n" + author + "\n" + checker + "\n" + source + "\n" + photography)
        getClickCount(href)  # prints the click count and the news number
        print("\n文章链接:" + href)
        print(content)
        print(separator)


# Script entry point: crawl the index page stored in `url` and print every listed article.
getNewDetail(url)

技术分享图片

爬取校园新闻首页的新闻的详情,使用正则表达式,函数抽离

标签:tail   html   encoding   group   title   inf   split()   .gz   rap   

原文地址:https://www.cnblogs.com/FZW1874402927/p/8747466.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!