数据结构化与保存

时间：2018-04-16 20:40:49 阅读：176 评论：0 收藏：0 [点我收藏+]

标签：import for 保存文章 global \n ptime data 结构化

import requests
from bs4 import BeautifulSoup
import string
import time
import datetime
import re
import pandas



def getNewDetail(d,hist):
    """Download one article page and return its details as a dict.

    Args:
        d: URL of the article detail page.
        hist: Click-count value, passed through unchanged to
            ``newsDetailItems``.

    Returns:
        The dict built by ``newsDetailItems`` from the text of the page's
        first ``.show-info`` element and the parsed document.

    Raises:
        IndexError: If the page contains no ``.show-info`` element.
    """
    print("详情：")
    # ``head`` is the module-level dict of request headers.
    response = requests.get(d, headers=head)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    info_text = soup.select(".show-info")[0].text
    print()
    return newsDetailItems(info_text, hist, soup)
   

def _field_after_label(text, label, default='未知'):
    """Return the value that directly follows ``label`` in ``text``.

    The value is the rest of the whitespace-delimited token that starts with
    ``label``. ``default`` is returned when ``label`` does not occur.
    """
    start = text.find(label)
    if start < 0:
        return default
    # text[start:] begins with the (non-blank) label, so split() is never empty.
    token = text[start:].split()[0]
    # Slice the label off by length: str.lstrip(label) would strip a *set of
    # characters* and eat leading value characters that also occur in the label.
    return token[len(label):]


def newsDetailItems(e,hist,soup):
    """Structure the details of one article into a dict.

    Args:
        e: Text of the article's info line, e.g.
            ``"发布时间:2018-04-01 11:57:00  作者：...  审核：...  来源：..."``.
        hist: Click-count value stored under ``'点击'`` when the info line
            contains the ``'点击：'`` label.
        soup: Parsed article page; must offer ``select("#content")`` returning
            a sequence whose first item has a ``text`` attribute.

    Returns:
        dict with keys ``'时间'`` (``datetime.datetime``), ``'作者'``,
        ``'审核'``, ``'来源'``, ``'摄影'`` (``'未知'`` when the label is
        absent), optionally ``'点击'``, and ``'新闻内容：'``.

    Raises:
        ValueError: If no ``%Y-%m-%d %H:%M:%S`` timestamp can be parsed.
        IndexError: If the page has no ``#content`` element.
    """
    news = {}

    # Publication time: the 19 characters following the label. Fall back to
    # the start of the text when the label is missing.
    time_label = '发布时间:'
    label_at = e.find(time_label)
    if label_at >= 0:
        stamp_source = e[label_at + len(time_label):].lstrip()
    else:
        stamp_source = e
    news['时间'] = datetime.datetime.strptime(stamp_source[:19], '%Y-%m-%d %H:%M:%S')

    # Author, reviewer, source and photographer share one "label：value" layout.
    for key in ('作者', '审核', '来源', '摄影'):
        news[key] = _field_after_label(e, key + '：')

    # The click count is recorded only when the info line advertises one.
    if '点击：' in e:
        news['点击'] = hist

    news['新闻内容：'] = soup.select("#content")[0].text

    # Five blank lines separate consecutive articles in the console output.
    for _ in range(5):
        print()
    return news

def getClickCount(d):
    """Return the click count of the article at URL ``d`` as a string.

    The article id is the part of the URL between ``_<digits>/`` and the next
    dot. It is sent to the counting API, whose reply ends with a call of the
    form ``.html('<count>');`` from which the count is cut out.

    Args:
        d: URL of the article detail page.

    Returns:
        The text found between ``.html('`` and ``');`` at the end of the
        API reply.
    """
    # Raw string: the pattern previously relied on invalid string escapes.
    news_id = "".join(re.findall(r'_\d+/(.*?)\.', d, re.S))

    # Build the URL of the click-count API for this article.
    count_url = 'http://oa.gzcc.cn/api.php?op=count&id=' + news_id + '&modelid=80'
    # ``head`` is the module-level dict of request headers.
    response = requests.get(count_url, headers=head)

    # Peel the surrounding "('" and "');" off the last ``.html`` call.
    return response.text.split('.html')[-1].lstrip("(')").rstrip("');")

def newscounter(counter):
    """Return the news count advanced by one item."""
    return counter + 1

def getListPagel(r,counter):
    """Collect every news item of one list page as structured dicts.

    For each ``<li>`` that contains a ``.news-list-title`` element, a summary
    dict is built, a text rendering of it is written to the module-level
    ``file_handle``, and the article's detail dict is fetched.

    Args:
        r: Response of the list-page request; only ``r.text`` is read.
        counter: Number of news items collected before this page.

    Returns:
        A tuple ``(counter, newslist1, newslist2)``: the updated item count,
        the list of summary dicts and the list of detail dicts of this page.
    """
    soup = BeautifulSoup(r.text, 'html.parser')
    # Summaries of the news items on this page
    newslist1 = []
    # Details of the news items on this page
    newslist2 = []

    for item in soup.select('li'):
        titles = item.select(".news-list-title")
        if not titles:
            # Not a news entry (navigation and other list items).
            continue

        info = item.select(".news-list-info")[0]
        news1 = {}
        news1["标题"] = titles[0].text
        news1["时间"] = info.contents[0].text
        news1["来源"] = info.contents[1].text
        news1["链接"] = item.select("a")[0].attrs['href']
        news1["内容概述"] = item.select(".news-list-description")[0].text
        news1["点击"] = getClickCount(news1["链接"]) + ' 次'

        # Running number of collected news items
        counter = newscounter(counter)
        print("已获取新闻数：" + str(counter))
        print()

        showone = "".join([
            "标题：", news1["标题"], '\n',
            "时间：", news1["时间"], '\n',
            "来源：", news1["来源"], '\n',
            "链接：", news1["链接"], '\n',
            "点击：", news1["点击"], '\n',
            "内容概述：", '\n',
            news1["内容概述"], '\n\n',
        ])
        # ``file_handle`` is the output file opened by the calling script.
        file_handle.write(showone)

        print()

        newslist1.append(news1)
        newslist2.append(getNewDetail(news1["链接"], news1["点击"]))

    return counter, newslist1, newslist2



# Request headers that make the crawler look like a desktop browser
head = {}
head['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'

# Number of news titles on the most recently fetched page; crawling stops at 0
temp = 1

# Index of the list page to start from
i = 230

# Running count of collected news items
counter = 0

# Empty the text output file (opening in 'w' mode truncates it)
with open('1.txt', mode='w', encoding='utf-8'):
    pass

# Summaries of all collected news items
newslist1 = []

# Details of all collected news items
newslist2 = []

# Keep going while the fetched page still contains ".news-list-title" elements,
# i.e. while the page exists
while temp > 0:
    page = i

    MainLink = "http://news.gzcc.cn/html/xiaoyuanxinwen/" + str(page) + ".html"
    if i == 1:
        # The first list page has no number in its URL.
        r = requests.get("http://news.gzcc.cn/html/xiaoyuanxinwen/", headers=head)
    else:
        r = requests.get(MainLink, headers=head)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')

    # Append this page to the text file. getListPagel writes through the
    # module-level name ``file_handle``, so the name must be bound here.
    with open('1.txt', mode='a', encoding='utf-8') as file_handle:
        counter, page_summaries, page_details = getListPagel(r, counter)

    newslist1.extend(page_summaries)
    newslist2.extend(page_details)

    print(newslist1)

    temp = len(soup.select(".news-list-title"))
    print("已爬取页数" + str(page))
    i = i + 1
    print()
    print("----------------------------------------------------------------------------------------------------------------------")
    print()

# Build a DataFrame from the summary dicts and save it as CSV
# ******************************************************************
df = pandas.DataFrame(newslist1)
print(df)
df.to_csv("1.csv")

# The queries below need the columns to exist, so skip them for an empty crawl.
if not df.empty:
    # '点击' holds strings such as "123 次"; compare on the numeric part.
    clicks = pandas.to_numeric(
        df['点击'].str.extract(r'(\d+)', expand=False), errors='coerce')

    # First 6 rows of click count, title and source
    print(df[['点击', '标题', '来源']].head(6))

    # News published by '学校综合办' with more than 3000 clicks
    print(df[(clicks > 3000) & (df['来源'] == '学校综合办')])

    # News published by '国际学院' or '学生工作处'
    print(df[df['来源'].isin(['国际学院', '学生工作处'])])
# ******************************************************************

从230页开始爬取：

技术分享图片

由于是从第230页开始爬取，所以并无满足后两个条件的新闻：

技术分享图片

数据结构化与保存

标签：import for 保存文章 global \n ptime data 结构化

原文地址：https://www.cnblogs.com/wban48/p/8858192.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行