词频统计（网易新闻）

时间：2019-07-23 16:42:50 阅读：218 评论：0 收藏：0 [点我收藏+]

标签：class rate python 针对 coding font https key http

词频统计（网易新闻）

import os
import re
import jieba
import requests

# Make sure the output directory exists. ``exist_ok=True`` replaces the
# separate exists-check, which could race with another process creating
# the directory between the check and the mkdir call.
os.makedirs('网易新闻', exist_ok=True)

# Download every article linked from the listed NetEase sports sections and
# gather the article bodies into ``str_`` for the word-frequency pass.

# Seconds to wait for a server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 10

# Compile the patterns once instead of on every loop iteration.
link_pattern = re.compile(r'href="(https://sports\.163\.com/.*?)"')
title_pattern = re.compile(r'<h1>(.*?)</h1>', re.S)
body_pattern = re.compile(
    '<div class="post_text" id="endText" style="border-top:1px solid #ddd;">(.*?责任编辑：.*?)</span>',
    re.S)
tag_pattern = re.compile(r'<.*?>')
# Raw string so that ``\s`` reaches the regex engine without an
# invalid-escape warning; the pattern itself is unchanged.
dirty_pattern = re.compile(r'[!"#$%&()*+,-./:;<=>?@[\]^_‘{|}~，…]|\s')

count = 0
news_parts = []

for i in ['nba']:
    # Collect all article URLs from the section index page.
    response = requests.get(f'https://sports.163.com/{i}/',
                            timeout=REQUEST_TIMEOUT)
    data = response.text
    url_res = set(link_pattern.findall(data))  # de-duplicate links

    # Process each URL individually.
    for url in url_res:
        try:
            url_response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # One unreachable article should not abort the whole crawl.
            continue
        url_data = url_response.text

        # Pages lacking the expected title/body markup are skipped.
        title_match = title_pattern.search(url_data)
        body_match = body_pattern.search(url_data)
        if title_match is None or body_match is None:
            continue

        # Strip the HTML tags from the article body.
        news_res = tag_pattern.sub('', body_match.group(1))

        # Remove punctuation and whitespace from the title.
        title = dirty_pattern.sub('', title_match.group(1))
        # Path an article file would use; the text is not written to disk here.
        title_path = os.path.join('网易新闻', f'{title}.txt')
        count += 1

        news_parts.append(news_res)

        print(f'完成{count}篇, {title} done...')

# Join once at the end instead of growing a string with repeated ``+=``.
str_ = ''.join(news_parts)

# Segment the collected text with jieba and tally how often each token occurs.
res = jieba.lcut(str_)
dic = {}
for word in res:
    # Single-character tokens are left out of the tally.
    if len(word) != 1:
        dic[word] = dic.get(word, 0) + 1

# Materialise the (word, count) pairs as a list so they can be ordered.
dic_list = [*dic.items()]

def func(i):
    """Return the element at index 1 of *i* (used as a sort key)."""
    second = i[1]
    return second

# Order the pairs from highest to lowest count. The list is sorted ascending
# and then flipped (rather than sorted with reverse=True) so that entries
# with equal counts keep the same relative order as before.
dic_list[:] = sorted(dic_list, key=func)[::-1]

# Print the 20 leading entries and join their words into one
# comma-terminated string.
top_entries = dic_list[:20]
for entry in top_entries:
    print(entry)
new_str = ''.join(f'{entry[0]},' for entry in top_entries)

import wordcloud

# Render the selected words as a word-cloud image.
# NOTE(review): the font path is a hard-coded Windows location with no file
# extension -- confirm it resolves on the target machine.
font_file = r'C:\Windows\Fonts\等线\Deng'
w = wordcloud.WordCloud(font_path=font_file)
w.generate(new_str)
# The image is written to the current working directory.
w.to_file('网易新闻.png')

词频统计（网易新闻）

标签：class rate python 针对 coding font https key http

原文地址：https://www.cnblogs.com/yushan1/p/11232397.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行