爬取所有校园新闻

时间：2017-10-12 12:08:49 阅读：111 评论：0 收藏：0 [点我收藏+]

标签：标题 groups none arc img gen day blog 工作

获取单条新闻的标题、链接、时间、来源、内容和点击次数，并包装成一个函数。
获取一个新闻列表页的所有新闻的上述详情，并包装成一个函数。
获取所有新闻列表页的网址，调用上述函数。


import requests
from bs4 import BeautifulSoup
import re

# Download the campus-news list page and collect every <li> element on it.
url_main = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(url_main, timeout=10)
# Decode the response body as UTF-8.
res.encoding = 'utf-8'

soup = BeautifulSoup(res.text, 'html.parser')
li = soup.select('li')

#获取一个新闻列表页的所有新闻的上述详情，并包装成一个函数

def gethits(url_1):
    """Return the click count of one news article as a string.

    The news id is taken from ``url_1`` (the text between the last ``/``
    that follows an underscore and the ``.html`` suffix) and sent to the
    click-counter API. The count is then cut out of the last
    ``.``-separated piece of the reply.

    Args:
        url_1: URL of the news article.

    Returns:
        The text left after stripping the wrapper characters from the reply.

    Raises:
        ValueError: if no news id can be found in ``url_1``.
    """
    # Raw string with an escaped dot so that only a literal ".html" matches.
    match = re.search(r'_.*/(.*)\.html', url_1)
    if match is None:
        raise ValueError('cannot extract news id from URL: {!r}'.format(url_1))
    li_id = match.group(1)

    count_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(li_id)
    # A timeout keeps the call from hanging forever on an unresponsive server.
    reply = requests.get(count_url, timeout=10).text

    # NOTE(review): presumably the reply ends with something like
    # html('123'); -- confirm against the live API.
    # lstrip/rstrip remove *sets of characters*, not prefixes/suffixes, which
    # is only safe while the payload itself contains none of those characters.
    hits = reply.split('.')[-1].lstrip("'html(").rstrip("');")
    return hits

#获取单条新闻的#标题#链接#时间#来源#正文 #点击次数，并包装成一个函数。
def getpageinfo(label):
    """Print the details of the first news entry found in ``label``.

    Elements without a ``.news-list-title`` child are skipped. For the first
    element that has one, the title, link, and the text of its first two
    ``<span>`` children are read, the linked article is downloaded for its
    ``.show-content`` text, and the click count is fetched via ``gethits``.
    Everything is printed, and the loop stops after that single entry.

    Args:
        label: iterable of parsed elements (the ``<li>`` tags of a list page).
    """
    for title_list in label:
        titles = title_list.select('.news-list-title')
        if not titles:
            # Not a news entry.
            continue

        href = title_list.select('a')[0]['href']
        title = titles[0].text
        spans = title_list.select('span')
        pub_time = spans[0].text
        info = spans[1].text

        # Download the article page itself to read its body text.
        res_list = requests.get(href, timeout=10)
        res_list.encoding = 'utf-8'
        soup_list = BeautifulSoup(res_list.text, 'html.parser')
        zhengwen = soup_list.select('.show-content')[0].text

        hits_list = gethits(href)

        print('时间：', pub_time, '\n标题：', title, '\n链接：', href,
              '\n来源：', info, '\n点击次数：', hits_list, '\n')
        print('正文：', zhengwen)
        # Only a single news entry is reported by this function.
        break

# Report the first news entry among the <li> elements selected from the list page.
getpageinfo(li)

技术分享

4.完成所有校园新闻的爬取工作。

import requests
from bs4 import BeautifulSoup
import re

# Download the campus-news list page and parse it.
mt = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(mt, timeout=10)
# Decode the response body as UTF-8.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, "html.parser")

def getdate(newurl):
    """Return the click count of the news article at ``newurl`` as a string.

    The news id is the second ``/``-separated piece of the text between the
    first underscore and ``.html`` in ``newurl``. It is sent to the
    click-counter API, and the count is cut out of the last ``.``-separated
    piece of the reply.

    Args:
        newurl: URL of the news article.

    Returns:
        The text left after stripping the wrapper characters from the reply.

    Raises:
        ValueError: if ``newurl`` contains no ``_....html`` part.
        IndexError: if the matched part contains no ``/``.
    """
    # Raw string with an escaped dot so that only a literal ".html" matches.
    match = re.search(r'_(.*)\.html', newurl)
    if match is None:
        raise ValueError('cannot extract news id from URL: {!r}'.format(newurl))
    news_id = match.group(1).split('/')[1]

    # Query the counter for *this* article rather than a hard-coded id.
    dijiurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    reply = requests.get(dijiurl, timeout=10).text

    # NOTE(review): presumably the reply ends with something like
    # html('123'); -- confirm against the live API.
    # lstrip/rstrip remove *sets of characters*, not prefixes/suffixes.
    diji = reply.split('.')[-1].lstrip("html('").rstrip("');")
    return diji


def genonepage(listurl):
    """Print and return the details of every news entry on one list page.

    The page at ``listurl`` is downloaded and every ``<li>`` that has a
    ``.news-list-title`` child is treated as a news entry. For each entry the
    linked article is downloaded for its ``.show-content`` text and the click
    count is fetched via ``getdate``. One summary line per entry is printed.

    Args:
        listurl: URL of a news list page.

    Returns:
        A list with one dict per news entry, with the keys ``day``,
        ``title``, ``source``, ``url``, ``hits`` and ``content``.
    """
    res = requests.get(listurl, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, "html.parser")

    records = []
    for news in soup.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            # Not a news entry.
            continue

        # The first two children of .news-list-info are read as day and source.
        info = news.select('.news-list-info')[0]
        day = info.contents[0].text
        sorce = info.contents[1].text
        url = news.select('a')[0]['href']
        title = titles[0].text

        # Download the article page itself to read its body text.
        resd = requests.get(url, timeout=10)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, "html.parser")
        textd = soupd.select('.show-content')[0].text

        hits = getdate(url)
        print(day, title, sorce, hits)
        records.append({
            'day': day,
            'title': title,
            'source': sorce,
            'url': url,
            'hits': hits,
            'content': textd,
        })
    return records


gzccurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(gzccurl, timeout=10)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, "html.parser")

# The first .a1 element holds the total number of news items, followed by '条'.
n = int(soup.select('.a1')[0].text.rstrip('条'))
# Ceiling division: 10 items per list page, without an extra page when n is
# an exact multiple of 10.
page = (n + 9) // 10

# NOTE(review): the index URL is assumed to be list page 1 and the numbered
# pages are assumed to continue from 2.html -- confirm against the site.
genonepage(gzccurl)
for i in range(2, page + 1):
    pageurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    print(pageurl)
    genonepage(pageurl)

技术分享

爬取所有校园新闻

标签：标题 groups none arc img gen day blog 工作

原文地址：http://www.cnblogs.com/a939833950/p/7655369.html