```python
# -*- coding: utf-8 -*-
import requests
import re
import pandas
from bs4 import BeautifulSoup
from datetime import datetime


def getPageN(pageUrl):
    res1 = requests.get(pageUrl)
    res1.encoding = 'utf-8'
    soup = BeautifulSoup(res1.text, 'html.parser')
    page = soup.select('#pages .a1')[0].text.strip('条')
    n = int(int(page) / 10)
    return n
```
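One caveat: `int(int(page) / 10)` floors the division, so when the total article count is not an exact multiple of 10 the final partial page is silently dropped. A minimal ceiling-division sketch, assuming the same pager markup (`getPageN_ceil` is our name, not from the original post):

```python
def getPageN_ceil(pageUrl):
    # Same scrape as getPageN, but ceiling division keeps the last partial page.
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    total = int(soup.select('#pages .a1')[0].text.strip('条'))
    return (total + 9) // 10  # e.g. 245 articles -> 25 pages, not 24
```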
```python
# 1. Save the body text of each news article to a text file.
def writeNewsDetail(content):
    f = open('gzcc.txt', 'a', encoding='utf-8')
    f.write(content)
    f.close()
```
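A small robustness note: `writeNewsDetail` opens and closes `gzcc.txt` by hand, so the handle leaks if `f.write` raises. An equivalent sketch using a context manager:

```python
def writeNewsDetail(content):
    # The with-block closes the file even when write() raises.
    with open('gzcc.txt', 'a', encoding='utf-8') as f:
        f.write(content)
```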
```python
# 8. Wrap the detail-page scraping in a function: def getNewDetail(newsUrl):
def getNewDetail(newsUrl):
    res1 = requests.get(newsUrl)
    res1.encoding = 'utf-8'
    soup = BeautifulSoup(res1.text, 'html.parser')
    news = {}
    news['title'] = soup.select('.show-title')[0].text
    # print(news['title'])
    info = soup.select('.show-info')[0].text
    # First 19 characters after the label, e.g. '2018-03-30 17:10:12'
    dt = info.lstrip('发布时间:')[:19]
    news['datetimes'] = str(datetime.strptime(dt, '%Y-%m-%d %H:%M:%S'))
    # Grab the text between the underscore and '.html', then keep the
    # last path segment as the article id.
    newsId = re.search('_(.*).html', newsUrl).group(1).split('/')[-1]
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resc = requests.get(clickUrl)
    # The count API returns a JavaScript snippet; peel off the wrapper to get the number.
    news['click'] = int(resc.text.split('.html')[-1].lstrip("('").rstrip("');"))
    source = info.find('来源:')
    if source > 0:
        news['sources'] = info[info.find('来源:'):].split()[0].lstrip('来源:')
        # print('来源:', news['sources'])
    author = info.find('作者:')
    if author > 0:
        news['authors'] = info[info.find('作者:'):].split()[0].lstrip('作者:')
        # print('作者:', news['authors'])
    y = info.find('摄影:')
    if y > 0:
        news['camera'] = info[info.find('摄影:'):].split()[0].lstrip('摄影:')
        # print('摄影:', news['camera'])
    news['newsUrl'] = newsUrl
    news['content'] = soup.select('#content')[0].text.strip()
    writeNewsDetail(news['content'])
    return news
```
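One pitfall in the function above: `str.lstrip`/`str.rstrip` take a *set of characters*, not a literal prefix or suffix. `info.lstrip('发布时间:')` therefore strips any leading run of those characters, and `.lstrip('来源:')` would also eat the start of a source name that happens to begin with 来, 源, or a colon. A prefix-safe sketch (`strip_prefix` is our helper, not part of the original post):

```python
def strip_prefix(text, prefix):
    # Drop an exact leading prefix instead of a character set.
    return text[len(prefix):] if text.startswith(prefix) else text

# e.g.: news['sources'] = strip_prefix(info[source:].split()[0], '来源:')
```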
```python
def getListPage(pageUrl):
    # Collect a details dict for every article linked on one list page.
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsUrl = news.select('a')[0].attrs['href']
            newsList.append(getNewDetail(newsUrl))
    # print(newsList)
    return newsList
```
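`getListPage` assumes every `href` on the list page is an absolute URL, which currently holds for this site. If a relative link ever appeared, resolving it against the page URL is a one-line change inside the loop (a defensive sketch, not in the original code):

```python
from urllib.parse import urljoin

# Resolves both absolute and relative hrefs against the list page URL.
newsUrl = urljoin(pageUrl, news.select('a')[0].attrs['href'])
```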
```python
newsTotal = []
pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# 2. Structure the news data as a list of dictionaries:
newsTotal.extend(getListPage(pageUrl))
n = getPageN(pageUrl)
for i in range(2, 3):
    listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    newsTotal.extend(getListPage(listPageUrl))
```
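As written, `range(2, 3)` fetches only list page 2, so the `n` returned by `getPageN` goes unused; this keeps the demo quick. To crawl every remaining page, the loop would use `n` as its bound (note this issues one request per page plus one per article, so be polite to the server):

```python
for i in range(2, n + 1):
    listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    newsTotal.extend(getListPage(listPageUrl))
```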
```python
# 3. Install pandas and build a DataFrame df with pandas.DataFrame(newsTotal).
df = pandas.DataFrame(newsTotal)

# 5. Analyse the data with pandas:
# the first 6 rows of click count, title, and source
print(df[['click', 'title', 'sources']].head(6))
# news published by '学校综合办' with more than 3000 clicks
print(df[(df['click'] > 3000) & (df['sources'] == '学校综合办')])
# news published by '国际学院' or '学生工作处'
sou = ['国际学院', '学生工作处']
print(df[df['sources'].isin(sou)])

# 4. Save the extracted data to a CSV or Excel file via df.
df.to_excel('gzcc456.xlsx')
```
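Depending on your pandas version, `to_excel` may need an extra engine such as `openpyxl` installed (`pip install openpyxl`). A CSV alternative that desktop Excel opens with the Chinese columns intact:

```python
# utf-8-sig writes a BOM so Excel auto-detects the encoding.
df.to_csv('gzcc456.csv', encoding='utf-8-sig')
```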
Original post: https://www.cnblogs.com/lzhshuai/p/8855161.html