标签:ror 编码方式 UI 异常 for ret 爬虫 x64 html
2017-07-29 23:20:24
主要技术路线:使用 requests 获取网页,用 bs4(BeautifulSoup)解析 HTML 表格,最后用 str.format 格式化输出排名。
import requests
from bs4 import BeautifulSoup

# Page holding the 2017 university ranking table that this script scrapes.
url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2017.html'


def gethtml(url):
    """Fetch *url* and return the ``requests`` response object.

    Opening a web page can fail, so the request is guarded: on any
    request-level failure (connection error, timeout, HTTP error status)
    the message "打开失败" is printed and ``-1`` is returned instead of
    a response. Callers must check for ``-1`` before using the result.
    """
    kv = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:54.0) '
            'Gecko/20100101 Firefox/54.0'
        ),
    }
    try:
        # A timeout keeps the script from hanging forever on a dead server.
        r = requests.get(url, headers=kv, timeout=30)
        # Raises requests.HTTPError when the status code signals a failure.
        r.raise_for_status()
    except requests.RequestException:
        print("打开失败")
        return -1
    # ``encoding`` is derived from the response headers, whereas
    # ``apparent_encoding`` is detected from the body; prefer the latter.
    r.encoding = r.apparent_encoding
    return r


def gettext(r):
    """Extract the ranking table from response *r* as a list of rows.

    The first row holds the text of (up to) the first four ``<th>`` cells
    of the first ``<tr>``. Every later ``<tr>`` contributes
    ``[row_index, td[1].string, td[2].string, td[3].string]``, where
    ``row_index`` is the 1-based position of that ``<tr>`` in the document.
    Rows with fewer than four ``<td>`` cells are skipped, and an empty
    list is returned when the page contains no ``<tr>`` at all.
    """
    soup = BeautifulSoup(r.text, 'html.parser')
    rows = soup('tr')
    if not rows:
        return []

    # Look the header cells up once instead of once per column.
    header_cells = rows[0]('th')
    ls = [[cell.string for cell in header_cells[:4]]]

    for i, row in enumerate(rows[1:], start=1):
        td = row('td')
        if len(td) < 4:
            # Not a data row (e.g. a spacer or a nested header); skip it.
            continue
        ls.append([i] + [cell.string for cell in td[1:4]])
    return ls


def printtext(ls):
    """Print the first three columns of every row in *ls*, centre-aligned.

    The middle column is padded with ``chr(12288)`` (the full-width
    ideographic space) so that CJK text lines up. Missing or ``None``
    cells are printed as empty strings rather than raising.
    """
    for row in ls:
        cols = ['' if value is None else value for value in row[:3]]
        cols += [''] * (3 - len(cols))
        print('{0:^10}\t{1:{3}^10}\t{2:^10}'.format(
            cols[0], cols[1], cols[2], chr(12288)))


if __name__ == '__main__':
    r = gethtml(url)
    # gethtml signals failure with -1; there is nothing to parse in that case.
    if r != -1:
        ls = gettext(r)
        printtext(ls)
标签:ror 编码方式 UI 异常 for ret 爬虫 x64 html
原文地址:http://www.cnblogs.com/TIMHY/p/7257915.html