标签:详细 get mozilla html subject size try string 爬取
一、第一版
第一版比较通俗易懂:使用 urllib 里的 request 模块加上 bs4 里的 BeautifulSoup 来抓取并解析页面(也可以用 requests 库代替 urllib 里的 request),结果直接打印在命令行中。
from urllib import request
from urllib import error

from bs4 import BeautifulSoup


def getHtml(
    url,
    ua_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko',
    num_retries=5,
):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        ua_agent: Value sent in the ``User-Agent`` request header.
        num_retries: How many more times to retry when the server answers
            with a 5xx status code. Other failures are not retried.

    Returns:
        The decoded page source, or ``None`` when the request failed with a
        ``URLError``/``HTTPError`` and no retry succeeded.
    """
    headers = {"User-Agent": ua_agent}
    req = request.Request(url, headers=headers)
    try:
        # The context manager closes the connection once the body is read.
        with request.urlopen(req) as response:
            return response.read().decode('utf-8')
    except error.URLError as e:
        # HTTPError is a subclass of URLError, so one clause covers both.
        # Only HTTPError carries a status code; plain URLError does not.
        code = getattr(e, 'code', None)
        if num_retries > 0 and code is not None and 500 <= code < 600:
            # Return the retry's result; dropping it would make every
            # successful retry still look like a failure to the caller.
            return getHtml(url, ua_agent, num_retries - 1)
    return None


def get_movie_all(html):
    """Return every movie entry element found in the page source.

    Args:
        html: Page source as returned by ``getHtml``; may be ``None`` or
            empty when the download failed.

    Returns:
        The ``div`` elements whose class attribute is ``bd doulist-subject``,
        or an empty list when there is no page source to parse.
    """
    if not html:
        return []
    soup = BeautifulSoup(html, "html.parser")
    return soup.find_all('div', class_='bd doulist-subject')


def get_movie_one(movie_list):
    """Build a one-line text summary for a single movie entry.

    Args:
        movie_list: One movie entry (an element returned by
            ``get_movie_all``, or anything whose ``str()`` is its HTML).

    Returns:
        A string made of the title text, the rating prefixed with
        ``"|| 评分:"`` (``5.0`` is used when the entry has no rating element),
        and each abstract text fragment prefixed with ``"|| "``, terminated
        by a newline. A missing title or abstract contributes nothing.
    """
    soup = BeautifulSoup(str(movie_list), "html.parser")
    parts = []

    title = soup.find('div', class_='title')
    if title is not None:
        parts.extend(title.stripped_strings)

    score = soup.find('span', class_='rating_nums')
    if score is None:
        # No rating element in this entry: fall back to the default score.
        parts.append("|| 评分:5.0")
    else:
        for line in score.stripped_strings:
            parts.append("|| 评分:")
            parts.append(line)

    abstract = soup.find('div', class_='abstract')
    if abstract is not None:
        for line in abstract.stripped_strings:
            parts.append("|| ")
            parts.append(line)

    parts.append('\n')
    return "".join(parts)


if __name__ == "__main__":
    # The list is paginated via the ``start`` query parameter in steps of 25;
    # this range yields only 0, so just the first page is crawled.
    for page in range(0, 25, 25):
        url = "https://www.douban.com/doulist/3516235/?start={}&sort=seq&playable=0&sub_type=".format(str(page))
        htmlInfo = getHtml(url)
        movie_list = get_movie_all(htmlInfo)
        for i in movie_list:
            print(get_movie_one(i))
标签:详细 get mozilla html subject size try string 爬取
原文地址:https://www.cnblogs.com/hooo-1102/p/11791983.html