标签:form index lis lib htm 完整 rom item .text
1. 选取的是 4399 小游戏的网址:http://www.4399.com/flash/gamehw.htm
2.网络上爬取的相关数据
import requests
from bs4 import BeautifulSoup
def get(gameurl):
    """Download a game-list page and print the title of every listed game.

    The first element matching the CSS class ``tm_list`` is located, and for
    each of its direct children the text of the first ``<a>`` is printed.

    Args:
        gameurl: URL of the page to download.

    Returns:
        None. Titles are written to stdout, one per line.

    Raises:
        requests.RequestException: if the download fails or times out.
        IndexError: if the page contains no ``.tm_list`` element.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(gameurl, timeout=10)
    # Decode as GB2312 instead of relying on the charset requests guesses.
    # NOTE(review): presumably the site serves GB2312 - confirm.
    res.encoding = 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    tm = soup.select('.tm_list')[0]
    for games in tm:
        try:
            title = games.select('a')[0].text
        except (AttributeError, IndexError):
            # AttributeError: the child is a text node, which has no .select().
            # IndexError: the child tag contains no <a>.
            continue
        print(title)
# Script entry: print the game titles found on the 4399 list page.
# get() prints the titles itself and returns None, so its result is not
# printed (doing so would emit a stray "None" line).
gameurl = 'http://www.4399.com/flash/gamehw.htm'
get(gameurl)
3.进行文本分析
import requests
from bs4 import BeautifulSoup
import jieba
def _top_words(words, limit=25):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Single-character tokens are ignored. Ties keep first-seen order.
    """
    from collections import Counter

    counts = Counter(word for word in words if len(word) != 1)
    return counts.most_common(limit)


def get(gameurl, txt, font_path=None):
    """Scrape game titles, print the most frequent words and show a word cloud.

    The first element matching the CSS class ``tm_list`` is located, and the
    text of the first ``<a>`` in each of its direct children is appended to
    *txt*. The combined text is segmented with jieba, the 25 most frequent
    multi-character words are printed with their counts, and a word cloud of
    all segmented tokens is displayed with matplotlib.

    Args:
        gameurl: URL of the page to download.
        txt: Text to analyse in addition to the scraped titles; the titles
            are appended to it.
        font_path: Optional path to a font file passed to ``WordCloud``.
            NOTE(review): the default font may not contain Chinese glyphs;
            pass a CJK-capable font if the cloud shows empty boxes.

    Returns:
        None. Results are printed and plotted.

    Raises:
        requests.RequestException: if the download fails or times out.
        IndexError: if the page contains no ``.tm_list`` element.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(gameurl, timeout=10)
    # Decode as GB2312 instead of relying on the charset requests guesses.
    # NOTE(review): presumably the site serves GB2312 - confirm.
    res.encoding = 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    tm = soup.select('.tm_list')[0]

    titles = []
    for games in tm:
        try:
            titles.append(games.select('a')[0].text)
        except (AttributeError, IndexError):
            # AttributeError: the child is a text node, which has no .select().
            # IndexError: the child tag contains no <a>.
            continue
    txt = txt + ''.join(titles)

    words = jieba.lcut(txt)
    # Fewer than 25 distinct words simply yields a shorter table.
    for word, count in _top_words(words):
        print("{:<5}{:>5}".format(word, count))

    w = " ".join(words)
    if not w.strip():
        # WordCloud.generate() raises on empty input; nothing to draw.
        return

    # Imported here so the plotting stack is only needed when a cloud is drawn.
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    wc = WordCloud(font_path=font_path).generate(w)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
# Script entry: analyse the titles on the 4399 list page, starting from an
# empty text. get() prints and plots its results and returns None, so its
# result is not printed (doing so would emit a stray "None" line).
gameurl = 'http://www.4399.com/flash/gamehw.htm'
txt = ''
get(gameurl, txt)
生成词云如下
标签:form index lis lib htm 完整 rom item .text
原文地址:http://www.cnblogs.com/0042ljc/p/7766860.html