Following an expert's tutorial, I wrote a simple scraper for the Douban Movie Top 250. Some notes on the process.
Tutorial link: here
After finishing the crawler, it kept failing with this error:
AttributeError: 'NoneType' object has no attribute 'find'
Cause: a method was called on an object that turned out to be None. The failing line was supposed to handle the last page (page 10), where there is no next-page link: the idea was to test the result of soup.find('span', attrs={'class': 'next'}).find('a') and choose the return value based on it, so the loop could end cleanly. But I crammed everything into a single expression, so ['href'] is evaluated before anything is checked, and the chained .find('a') raises as soon as the span lookup comes back None:
import requests
from bs4 import BeautifulSoup

def Download_page(url):
    # Fetch one listing page and collect the movie titles on it.
    data = requests.get(url).content
    soup = BeautifulSoup(data, 'lxml')
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        detail = movie_li.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'})
        movie_name_list.append(movie_name.text)
    # The error is here: on the last page the span lookup returns None,
    # and the chained .find('a') raises AttributeError.
    next_page = down_url + soup.find('span', attrs={'class': 'next'}).find('a')['href']
    if next_page:  # always truthy -- the string is already built, so this check comes too late
        return movie_name_list, next_page
    return movie_name_list, None

down_url = 'https://movie.douban.com/top250'
url = down_url
with open('G://movie_name_top250.txt', 'w') as f:
    while url:
        movie, url = Download_page(url)
        f.write(str(movie))
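The fix is to split the chained lookup in two, so the None check happens before ['href'] is touched. A minimal sketch of the corrected tail of Download_page (it guards both the missing span and a span with no <a> inside, since either can happen on the last page):

# Look up the "next" span first; only read ['href'] once we know
# an <a> tag actually exists.
next_span = soup.find('span', attrs={'class': 'next'})
next_link = next_span.find('a') if next_span else None
if next_link:
    return movie_name_list, down_url + next_link['href']
return movie_name_list, None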
This is the complete code given in the tutorial; worth studying:
#!/usr/bin/env python
# encoding=utf-8
"""
Scrape the Douban Movie Top 250 - complete example code
"""
import codecs
import requests
from bs4 import BeautifulSoup

DOWNLOAD_URL = 'http://movie.douban.com/top250/'

def download_page(url):
    return requests.get(url, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }).content

def parse_html(html):
    soup = BeautifulSoup(html)
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        detail = movie_li.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).getText()
        movie_name_list.append(movie_name)
    next_page = soup.find('span', attrs={'class': 'next'}).find('a')
    if next_page:
        return movie_name_list, DOWNLOAD_URL + next_page['href']
    return movie_name_list, None

def main():
    url = DOWNLOAD_URL
    with codecs.open('movies', 'wb', encoding='utf-8') as fp:
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            fp.write(u'{movies}\n'.format(movies='\n'.join(movies)))

if __name__ == '__main__':
    main()
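Two things stand out compared with my version. First, downloading (download_page) and parsing (parse_html) are separate functions, and parse_html returns a (movie_name_list, next_url) tuple, so main() can drive the whole crawl with a single while url: loop that stops naturally once next_url is None. Second, BeautifulSoup(html) is called without naming a parser; recent versions of bs4 will guess one and print a warning, so when reusing this code it is safer to pass the parser explicitly, for example:

soup = BeautifulSoup(html, 'lxml')  # or 'html.parser' from the standard library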