标签:info 爬虫 strip 内容 col 爬取 end from baidu
data:image/s3,"s3://crabby-images/6e2c4/6e2c4ed46c1a9225584b85a91041a34e6f2cbb40" alt="技术图片"
data:image/s3,"s3://crabby-images/031da/031da47c7768a4c1b872849e1839f93875d2ff5d" alt="技术图片"
data:image/s3,"s3://crabby-images/89307/893075abcd36f3cd34fad92718441b64d19ed431" alt="技术图片"
# Scrape the Baidu "top10" buzz page and print the first ten titles next to
# the first ten "icon-rise" values as a two-column table.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the HTML page.
url = 'http://top.baidu.com/buzz.php?p=top10&tdsourcetag=s_pctim_aiomsg&qq-pf-to=pcqq.c2c?'
kv = {'user-agent': 'Mozilla/5.0'}  # browser-like User-Agent so the request is not sent as a bare script
r = requests.get(url, timeout=30, headers=kv)
r.raise_for_status()  # stop here on an HTTP error status instead of parsing an error page
r.encoding = r.apparent_encoding  # decode the body with the encoding guessed from its content
html = r.text  # page source
soup = BeautifulSoup(html, 'html.parser')

# Parse the page and extract the content: at most the first 10 matches of
# each CSS class, with surrounding whitespace stripped.
a = [x.get_text().strip() for x in soup.find_all(class_="list-title")[:10]]
b = [y.get_text().strip() for y in soup.find_all(class_="icon-rise")[:10]]

text = [a, b]
print(text)

# Build the frame with one row per list, then transpose so each list becomes
# a column ("标题" / "热度").
c = pd.DataFrame(text, index=["标题", "热度"])
print(c.T)
data:image/s3,"s3://crabby-images/a6062/a6062393eb2b4126789a2163c2991d5a3123f69b" alt="技术图片"
标签:info 爬虫 strip 内容 col 爬取 end from baidu
原文地址:https://www.cnblogs.com/yyy6265/p/12534257.html