标签:turn sub timeout comm cep main div http html_
# ---------------------------------------------------------------------------
# Generic crawler framework
# ---------------------------------------------------------------------------
import requests


def get_html_text(url):
    """Fetch ``url`` over HTTP and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response text on success, or the sentinel string
        "产生异常" when the request fails (connection error, timeout,
        invalid URL, or a 4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=20)
        # Turn 4xx/5xx responses into an exception so they share the
        # failure path below.
        r.raise_for_status()
        # Decode using the encoding detected from the body rather than the
        # one declared (or missing) in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are swallowed; KeyboardInterrupt and
        # programming errors still propagate.
        return "产生异常"


if __name__ == '__main__':
    url = "http://www.baidu.com"
    print(get_html_text(url))


# ---------------------------------------------------------------------------
# Example (second standalone snippet: print the comments of a Douban book)
# ---------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch ``url`` over HTTP and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response text on success, or the sentinel string
        "产生异常" when the request fails.
    """
    try:
        r = requests.get(url, timeout=20)  # give up after 20 seconds
        r.raise_for_status()  # raise on an unsuccessful status code
        r.encoding = r.apparent_encoding  # decode with the detected encoding
        return r.text  # return the downloaded content
    except requests.RequestException:  # request-related failures only
        return "产生异常"


if __name__ == '__main__':
    url = "https://book.douban.com/subject/1084336/comments/"  # page to request
    # print(getHTMLText(url))  # call the function
    # Keep the page text in its own name; the original rebound ``requests``
    # here, which hid the imported module.
    html = getHTMLText(url)  # fetch the page text
    soup = BeautifulSoup(html, "html.parser")  # parse the document
    comments = soup.find_all("div", class_="comment")  # select comment blocks
    for comment in comments:
        # ``comment.p`` is None when the div holds no <p> tag; skip those
        # instead of failing with AttributeError.
        if comment.p is not None:
            print(comment.p.text)  # print each comment's text
标签:turn sub timeout comm cep main div http html_
原文地址:https://www.cnblogs.com/mai1994/p/11146063.html