```python
# -*- coding: utf-8 -*-
import requests
import re
import pandas
from bs4 import BeautifulSoup
from datetime import datetime


def getPageN(pageUrl):
    res1 = requests.get(pageUrl)
    res1.encoding = 'utf-8'
    soup = BeautifulSoup(res1.text, 'html.parser')
    page = soup.select('#pages .a1')[0].text.strip('条')
    n = int(int(page) / 10)
    return n
```
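One caveat: `int(int(page) / 10)` floors the division, so when the total article count is not an exact multiple of 10 the final partial page is silently dropped. A minimal ceiling-division sketch, assuming the same pager markup (`getPageN_ceil` is our name, not from the original post):

```python
def getPageN_ceil(pageUrl):
    # Same scrape as getPageN, but ceiling division keeps the last partial page.
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    total = int(soup.select('#pages .a1')[0].text.strip('条'))
    return (total + 9) // 10  # e.g. 245 articles -> 25 pages, not 24
```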
```python
# 1. Save the body text of each news article to a text file.
def writeNewsDetail(content):
    f = open('gzcc.txt', 'a', encoding='utf-8')
    f.write(content)
    f.close()
```
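A small robustness note: `writeNewsDetail` opens and closes `gzcc.txt` by hand, so the handle leaks if `f.write` raises. An equivalent sketch using a context manager:

```python
def writeNewsDetail(content):
    # The with-block closes the file even when write() raises.
    with open('gzcc.txt', 'a', encoding='utf-8') as f:
        f.write(content)
```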
```python
# 8. Wrap the detail-page scraping in a function: def getNewDetail(newsUrl):
def getNewDetail(newsUrl):
    res1 = requests.get(newsUrl)
    res1.encoding = 'utf-8'
    soup = BeautifulSoup(res1.text, 'html.parser')
    news = {}
    news['title'] = soup.select('.show-title')[0].text
    # print(news['title'])
    info = soup.select('.show-info')[0].text
    # First 19 characters after the label, e.g. '2018-03-30 17:10:12'
    dt = info.lstrip('发布时间:')[:19]
    news['datetimes'] = str(datetime.strptime(dt, '%Y-%m-%d %H:%M:%S'))
    # Grab the text between the underscore and '.html', then keep the
    # last path segment as the article id.
    newsId = re.search('_(.*).html', newsUrl).group(1).split('/')[-1]
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resc = requests.get(clickUrl)
    # The count API returns a JavaScript snippet; peel off the wrapper to get the number.
    news['click'] = int(resc.text.split('.html')[-1].lstrip("('").rstrip("');"))
    source = info.find('来源:')
    if source > 0:
        news['sources'] = info[info.find('来源:'):].split()[0].lstrip('来源:')
        # print('来源:', news['sources'])
    author = info.find('作者:')
    if author > 0:
        news['authors'] = info[info.find('作者:'):].split()[0].lstrip('作者:')
        # print('作者:', news['authors'])
    y = info.find('摄影:')
    if y > 0:
        news['camera'] = info[info.find('摄影:'):].split()[0].lstrip('摄影:')
        # print('摄影:', news['camera'])
    news['newsUrl'] = newsUrl
    news['content'] = soup.select('#content')[0].text.strip()
    writeNewsDetail(news['content'])
    return news
```
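One pitfall in the function above: `str.lstrip`/`str.rstrip` take a *set of characters*, not a literal prefix or suffix. `info.lstrip('发布时间:')` therefore strips any leading run of those characters, and `.lstrip('来源:')` would also eat the start of a source name that happens to begin with 来, 源, or a colon. A prefix-safe sketch (`strip_prefix` is our helper, not part of the original post):

```python
def strip_prefix(text, prefix):
    # Drop an exact leading prefix instead of a character set.
    return text[len(prefix):] if text.startswith(prefix) else text

# e.g.: news['sources'] = strip_prefix(info[source:].split()[0], '来源:')
```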
```python
def getListPage(pageUrl):
    # Collect a details dict for every article linked on one list page.
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsUrl = news.select('a')[0].attrs['href']
            newsList.append(getNewDetail(newsUrl))
    # print(newsList)
    return newsList
```
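`getListPage` assumes every `href` on the list page is an absolute URL, which currently holds for this site. If a relative link ever appeared, resolving it against the page URL is a one-line change inside the loop (a defensive sketch, not in the original code):

```python
from urllib.parse import urljoin

# Resolves both absolute and relative hrefs against the list page URL.
newsUrl = urljoin(pageUrl, news.select('a')[0].attrs['href'])
```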
```python
newsTotal = []
pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# 2. Structure the news data as a list of dictionaries:
newsTotal.extend(getListPage(pageUrl))
n = getPageN(pageUrl)
for i in range(2, 3):
    listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    newsTotal.extend(getListPage(listPageUrl))
```
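As written, `range(2, 3)` fetches only list page 2, so the `n` returned by `getPageN` goes unused; this keeps the demo quick. To crawl every remaining page, the loop would use `n` as its bound (note this issues one request per page plus one per article, so be polite to the server):

```python
for i in range(2, n + 1):
    listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    newsTotal.extend(getListPage(listPageUrl))
```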
```python
# 3. Install pandas and build a DataFrame df with pandas.DataFrame(newsTotal).
df = pandas.DataFrame(newsTotal)

# 5. Analyse the data with pandas:
# the first 6 rows of click count, title, and source
print(df[['click', 'title', 'sources']].head(6))
# news published by '学校综合办' with more than 3000 clicks
print(df[(df['click'] > 3000) & (df['sources'] == '学校综合办')])
# news published by '国际学院' or '学生工作处'
sou = ['国际学院', '学生工作处']
print(df[df['sources'].isin(sou)])

# 4. Save the extracted data to a CSV or Excel file via df.
df.to_excel('gzcc456.xlsx')
```
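Depending on your pandas version, `to_excel` may need an extra engine such as `openpyxl` installed (`pip install openpyxl`). A CSV alternative that desktop Excel opens with the Chinese columns intact:

```python
# utf-8-sig writes a BOM so Excel auto-detects the encoding.
df.to_csv('gzcc456.csv', encoding='utf-8-sig')
```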
Original post: https://www.cnblogs.com/lzhshuai/p/8855161.html