爬虫大作业

时间：2018-04-30 18:09:41 阅读：241 评论：0 收藏：0 [点我收藏+]

标签：etl dal class utf8 区块 \n 表达式 sts import

import re
from collections import Counter
from datetime import datetime

import jieba
import requests
from bs4 import BeautifulSoup

# 获取新闻细节
def getNewsDetail(newsUrl):
    """Fetch one news article, print its details, and save its text to disk.

    The page is downloaded and decoded as GB2312. The article body is read
    from the first ``#endText`` element and the publication line from the
    first ``.post_time_source`` element. The publish time, the source, up to
    three keywords (computed by ``getKeyWords``) and the article body are
    printed, and the body is then written to a fixed output file, replacing
    any previous content.

    Args:
        newsUrl: URL of the article page to download.

    Returns:
        None. All results are printed or written to the output file.

    Raises:
        IndexError: if the page lacks ``#endText`` or ``.post_time_source``.
        ValueError: if the publication line contains no time stamp or no
            source label, or the time stamp is not ``YYYY-MM-DD HH:MM:SS``.
    """
    # A timeout keeps the crawler from hanging forever on a stalled server.
    resd = requests.get(newsUrl, timeout=10)
    resd.encoding = 'gb2312'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('#endText')[0].text
    info = soupd.select('.post_time_source')[0].text

    # Locate the time stamp inside the publication line, then parse it.
    date_match = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info)
    if date_match is None:
        raise ValueError('no publish time found in {!r}'.format(info))
    dateTime = datetime.strptime(date_match.group(1), '%Y-%m-%d %H:%M:%S')

    source_match = re.search(r'来源:\s*(.*)', info)
    if source_match is None:
        raise ValueError('no source found in {!r}'.format(info))
    sources = source_match.group(1)

    keyWords = getKeyWords(content)
    print('发布时间：{0}\n来源：{1}'.format(dateTime, sources))
    # Joining avoids an IndexError when fewer than three keywords exist;
    # with exactly three keywords the output is the same as before.
    print('关键词：' + '、'.join(keyWords[:3]))
    print(content)

    # Raw string: the path contains a backslash that must stay literal.
    with open(r"D:\python/test.txt", 'w', encoding='utf8') as fo:
        fo.write(content)



# 通过jieba分词，获取新闻关键词
def getKeyWords(content):
    """Return up to three of the most frequent multi-character words.

    All characters outside the range U+4E00..U+9FA5 are discarded, the
    remaining text is segmented with jieba, and words shorter than two
    characters are ignored. Frequencies are counted per segmented token, so
    a word is not credited for appearing inside a longer token.

    Args:
        content: The text to analyse.

    Returns:
        A list of at most three words, most frequent first. Words with equal
        counts keep the order in which they first appear in the text. The
        list is shorter than three (possibly empty) when the text does not
        contain enough distinct multi-character words.
    """
    # Keep only the characters matched by the pattern so punctuation, digits
    # and Latin letters never reach the segmenter.
    chinese_only = ''.join(re.findall('[\u4e00-\u9fa5]', content))
    tokens = jieba.lcut(chinese_only)
    # Single-character tokens are dropped because they carry little meaning
    # as keywords.
    counts = Counter(word for word in tokens if len(word) >= 2)
    return [word for word, _ in counts.most_common(3)]



# 获取一页的新闻
def getListPage(listUrl):
    """Print the first linked news item of a listing page and its details.

    The listing page is downloaded and decoded as GBK. Within the first
    ``#news-flow-content`` element, the ``li`` items are scanned in order;
    the first one containing a link has its title and URL printed and is
    then passed to ``getNewsDetail``.

    Args:
        listUrl: URL of the news listing page.

    Returns:
        None.

    Raises:
        IndexError: if the page has no ``#news-flow-content`` element.
    """
    # A timeout keeps the crawler from hanging forever on a stalled server.
    res = requests.get(listUrl, timeout=10)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    for new in soup.select('#news-flow-content')[0].select('li'):
        links = new.select('a')
        if not links:
            # Skip list items that carry no link instead of crashing.
            continue
        url = links[0]['href']
        title = links[0].text
        print('标题：{0}\n链接：{1}'.format(title, url))
        # getNewsDetail prints its own output and returns nothing, so it is
        # called directly rather than wrapped in print().
        getNewsDetail(url)

        # NOTE(review): only one item per page is processed; this looks like
        # a deliberate limit, confirm before removing.
        break




# Script entry: crawl the news listing page at tech.163.com/it/.
listUrl = 'http://tech.163.com/it/'
getListPage(listUrl)

因为一直无法下载 Python 中 wordcloud 的相关文件，于是改用了网上的词云生成器，网址是：https://timdream.org/wordcloud

技术分享图片

运行效果图：

技术分享图片

此次爬取的是一篇关于区块链的新闻：先使用 Python 爬取新闻页面的内容，然后使用网上的词云生成器生成词云。

爬虫大作业

标签：etl dal class utf8 区块 \n 表达式 sts import

原文地址：https://www.cnblogs.com/zxc109525/p/8972524.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行