标签:ref gecko temp ongl soup 查找 nbsp image sel
http://www.kugou.com/yy/rank/home/1-8888.html
排名
文件&&歌手
时长
效果:
附源码:
import time
import json
from bs4 import BeautifulSoup
import requests


def _split_song_name(text):
    """Split link text of the form "singer - title" into (singer, title).

    Falls back to a bare "-" separator, and to an empty singer when the
    text contains no separator at all, instead of raising IndexError.
    """
    singer, sep, title = text.partition(' - ')
    if not sep:
        # No spaced separator; try a bare hyphen before giving up.
        singer, sep, title = text.partition('-')
    if not sep:
        return '', text.strip()
    return singer.strip(), title.strip()


class Kugou(object):
    """Scrape rank, title, singer and duration from a Kugou ranking page."""

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.header = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
        }

    def getInfo(self, url):
        """Fetch one ranking page, print each song and append it to hhh.txt.

        Args:
            url: address of the ranking page to download.
        """
        html = requests.get(url, headers=self.header)
        soup = BeautifulSoup(html.text, 'html.parser')
        # print(soup.prettify())
        ranks = soup.select('.pc_temp_num')
        # Walk down the nested tags to reach each song link.
        titles = soup.select('.pc_temp_songlist > ul > li > a')
        times = soup.select('.pc_temp_time')

        lines = []
        for rank, title, songTime in zip(ranks, titles, times):
            singer, song_title = _split_song_name(title.get_text())
            data = {
                # get_text() keeps only the text; the element itself
                # would print together with its HTML tags.
                'rank': rank.get_text().strip(),
                'title': song_title,
                'singer': singer,
                'songTime': songTime.get_text().strip()
            }
            print('rank:%2s\t' % data['rank'], 'title:%2s\t' % data['title'],
                  'singer:%2s\t' % data['singer'], 'songTime:%2s\t' % data['songTime'])
            lines.append(str(data) + '\n')

        # Open the output file once per page rather than once per song.
        if lines:
            with open('hhh.txt', 'a', encoding='utf8') as f:
                f.writelines(lines)


if __name__ == '__main__':
    # Page numbers in the URL start at 1 (".../home/1-8888.html").
    # NOTE(review): the original iterated range(30), i.e. from page 0 -
    # confirm that page 0 is not a distinct page before relying on this.
    urls = [
        'http://www.kugou.com/yy/rank/home/{}-8888.html'.format(str(i))
        for i in range(1, 30)
    ]
    kugou = Kugou()
    for url in urls:
        kugou.getInfo(url)
        # Pause between requests to avoid hammering the site.
        time.sleep(1)
--------------------------------------------------------------------
# Build the ranking-page URLs for pages 1 to 4 and print each one.
urls = ['http://www.kugou.com/yy/rank/home/{}-8888.html'.format(str(page)) for page in range(1, 5)]
for url in urls:
    print(url)
结果打印:
http://www.kugou.com/yy/rank/home/1-8888.html
http://www.kugou.com/yy/rank/home/2-8888.html
http://www.kugou.com/yy/rank/home/3-8888.html
http://www.kugou.com/yy/rank/home/4-8888.html
--------------------------------------------------------------------
# ranks, titles and times are the element lists selected from the page.
for rank, title, songTime in zip(ranks, titles, times):
    data = {
        # get_text() keeps only the text; printing the element itself
        # would include its HTML tags.
        'rank': rank.get_text().strip(),
        # The link text reads "singer - title", so the title is the
        # second part and the singer the first.
        'title': title.get_text().split('-')[1].strip(),
        'singer': title.get_text().split('-')[0].strip(),
        'songTime': songTime.get_text().strip()
    }
    print(data['rank'])
    print(data['title'])
    print(data['singer'])
    print(data['songTime'])
结果打印:
1
飞驰于你
许嵩
4:04
--------------------------------------------------------------------
# ranks, titles and times are the element lists selected from the page.
for rank, title, songTime in zip(ranks, titles, times):
    data = {
        # Storing the elements themselves (no get_text()) means they
        # print together with their HTML tags.
        'rank': rank,
        'title': title,
        'songTime': songTime
    }
    print(data['rank'])
    print(data['title'])
    print(data['songTime'])
结果打印:
<span class="pc_temp_num">
<strong>1</strong>
</span>
<a class="pc_temp_songname" data-active="playDwn" data-index="0" hidefocus="true" href="http://www.kugou.com/song/pjn5xaa.html" title="许嵩 - 飞驰于你">许嵩 - 飞驰于你</a>
<span class="pc_temp_time"> 4:04 </span>
标签:ref gecko temp ongl soup 查找 nbsp image sel
原文地址:https://www.cnblogs.com/ftl1012/p/9614146.html