码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫大作业

时间:2018-04-30 22:16:45      阅读:137      评论:0      收藏:0      [点我收藏+]

标签:jin   lib   code   add   enc   .text   print   eval   dict   

import json
import re
import urllib
from collections import Counter

import jieba
import jieba.analyse
import matplotlib.pyplot as plt
import numpy as np
import requests
import xlwt
from bs4 import BeautifulSoup
from PIL import Image, ImageSequence
from wordcloud import WordCloud

# Landing page of the search UI. The parsed soup is not consulted by the rest
# of this script; the titles come from the JSON search API queried below.
url = "https://juejin.im/search?query=前端"
res = requests.get(url)
res.encoding = "utf-8"
soup = BeautifulSoup(res.text, "html.parser")

# Paged JSON endpoint; the query value is the percent-encoded form of "前端".
ajaxUrlBegin = "https://search-merger-ms.juejin.im/v1/search?query=%E5%89%8D%E7%AB%AF&page="
ajaxUrlLast = "&raw_result=false&src=web"

# Fetch every page inside the loop (previously only the last URL built by the
# loop was requested, and its result was re-read 25 times). The output file is
# opened once, in append mode as before, so titles from earlier runs are kept.
with open("finally.txt", "a", encoding="utf-8") as f:
    for page in range(0, 25):
        ajaxUrl = ajaxUrlBegin + str(page) + ajaxUrlLast
        response = requests.get(ajaxUrl, timeout=10)
        response.raise_for_status()
        # Parse the body as JSON directly; never eval() text received from a
        # remote server (it executes arbitrary code and chokes on
        # true/false/null).
        data = response.json()
        # Result entries live under the "d" key, each carrying a "title".
        # Iterate whatever the page returns instead of a hard-coded count.
        for entry in data["d"]:
            result = entry["title"]
            print(result + "\n")
            # Titles are written back-to-back with no separator, matching the
            # original file format consumed by the word-count step.
            f.write(result)

# Load all collected titles and split them into words with jieba.
with open("finally.txt", "r", encoding="utf-8") as f:
    text = f.read()  # named `text` so the builtin `str` is not shadowed
stringList = list(jieba.cut(text))

# Tokens that should not be counted as words. The original literal repeated
# "" and " " many times; a set collapses duplicates, so this is equivalent.
# NOTE(review): the repeated empty strings may once have held punctuation
# characters that were lost -- confirm and add them back here if so.
symbol = {"/", "(", ")", " ", "", "+", "?"}
stringSet = set(stringList) - symbol

# Count every token in a single pass instead of calling list.count() once per
# distinct word, which was quadratic in the number of tokens.
word_counts = Counter(stringList)
title_dict = {word: word_counts[word] for word in stringSet}
print(title_dict)

# Export the word -> count mapping to an .xls workbook: one row per word,
# column 0 holds the word and column 1 its count.
di = title_dict
wbk = xlwt.Workbook(encoding="utf-8")
sheet = wbk.add_sheet("wordCount")
for row, (word, count) in enumerate(di.items()):
    sheet.write(row, 0, label=word)
    sheet.write(row, 1, label=count)
wbk.save("前端数据.xls")

# Render a word cloud from the distinct words found in the titles.
# A font file is passed explicitly; SimHei is used here so that Chinese words
# can be drawn (Windows-specific path).
font = r"C:\Windows\Fonts\simhei.ttf"
content = " ".join(title_dict.keys())

# test.jpg supplies the mask shape; close the image file once it has been
# copied into an array.
with Image.open("test.jpg") as mask_file:
    image = np.array(mask_file)

# NOTE(review): per the wordcloud docs, width/height are ignored when a mask
# is given -- the arguments are kept as in the original call.
wordcloud = WordCloud(
    background_color="white",
    font_path=font,
    mask=image,
    width=1000,
    height=860,
    margin=2,
).generate(content)

# Save before showing so the image is written even if the (blocking) viewer
# window is never closed cleanly.
wordcloud.to_file("c-cool.jpg")

plt.imshow(wordcloud)
plt.axis("off")
plt.show()

生成词云

 

技术分享图片

 

爬虫大作业

标签:jin   lib   code   add   enc   .text   print   eval   dict   

原文地址:https://www.cnblogs.com/OZX143570/p/8974665.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!