标签:[] list tag UI time inf for int timeout
爬取的是“最好大学网”,提取2017年排名前20名大学的名称和分数。
# coding: utf-8
"""Scrape a university ranking table and print the top entries.

Downloads the 2017 ranking page from zuihaodaxue.cn, collects three
cells from every table row, and prints the second and third collected
values (presumably university name and score -- confirm against the
live page layout) for the first 20 rows.
"""
import bs4
from bs4 import BeautifulSoup
import requests


def getHTMLText(url):
    """Return the decoded body of *url*, or the string "fail" on error.

    Args:
        url: Address of the page to download.

    Returns:
        The response text decoded with the encoding detected from the
        content, or the sentinel string "fail" if the request could not
        be completed or the server answered with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Prefer the encoding detected from the body over the header value.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are mapped to the sentinel; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        return "fail"


def fillUnivList(ulist, html):
    """Append one ``[cell0, cell1, cell3]`` entry to *ulist* per table row.

    Args:
        ulist: List that is extended in place.
        html: HTML text expected to contain a ``<tbody>`` element.

    Nothing is appended when *html* has no ``<tbody>`` (for instance when
    the download failed), and rows with fewer than four ``<td>`` cells
    are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes; keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr("td")
        if len(tds) < 4:
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])


def printUnivList(ulist, num):
    """Print the second and third field of the first *num* entries.

    Args:
        ulist: Entries as produced by ``fillUnivList``.
        num: Maximum number of entries to print; if *ulist* is shorter,
            only the available entries are printed.
    """
    for u in ulist[:num]:
        print(u[1], u[2])


def main():
    """Fetch the ranking page and print its first 20 entries."""
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2017.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)


if __name__ == "__main__":
    main()
结果:
标签:[] list tag UI time inf for int timeout
原文地址:http://www.cnblogs.com/wyfighting/p/7469230.html