码迷,mamicode.com
首页 > 其他好文 > 详细

20170513爬取猫眼电影Top100

时间:2017-05-13 20:01:51      阅读:332      评论:0      收藏:0      [点我收藏+]

标签:ascii   main   status   int   out   boa   top100   str   yield   

import json
import re
import requests
from bs4 import BeautifulSoup
from requests import RequestException
from multiprocessing import Pool
def get_one_page(url):
    """Download one page and return its HTML text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text when the server answers with HTTP 200.
        ``None`` for any other status code, or when the request raises
        ``RequestException`` (including the 5-second timeout).
    """
    # Send a crawler-style User-Agent rather than the requests default.
    # NOTE(review): presumably needed so the site serves the page - confirm.
    headers = {'User-Agent': 'baiduspider+'}
    try:
        response = requests.get(url, headers=headers, timeout=5)
    except RequestException:
        # Best-effort fetch: callers treat None as "page unavailable".
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Yield one record per ``<dd>`` entry found in a board page.

    Args:
        html: HTML text of a board page. ``None`` or an empty string
            yields nothing.

    Yields:
        dict with the string values ``rank``, ``name``, ``star``,
        ``releasetime`` and ``grade``. ``grade`` is the text of the
        ``.integer`` element concatenated with the text of the
        ``.fraction`` element. ``<dd>`` entries that lack any of the
        expected elements are skipped.
    """
    if not html:
        # A failed download hands us None; there is nothing to parse.
        return
    for item in BeautifulSoup(html, 'lxml').find_all('dd'):
        rank = item.select_one('i')
        name = item.select_one('p > a')
        star = item.select_one('.star')
        releasetime = item.select_one('.releasetime')
        integer = item.select_one('.integer')
        fraction = item.select_one('.fraction')
        nodes = (rank, name, star, releasetime, integer, fraction)
        if any(node is None for node in nodes):
            # Incomplete entry: skip it instead of aborting the whole page.
            continue
        yield {
            'rank': rank.text,
            'name': name.text,
            'star': star.text.strip(),
            'releasetime': releasetime.text,
            'grade': integer.text + fraction.text,
        }
def write_to_file(content):
    """Append *content* to ``result.txt`` as one line of JSON.

    Args:
        content: JSON-serialisable object (the script passes a dict).

    The file is opened in append mode with UTF-8 encoding in the current
    working directory. ``ensure_ascii=False`` keeps non-ASCII characters
    readable in the file instead of ``\\uXXXX`` escapes.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        # Serialise the dict to a string, one record per line.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
    """Fetch, parse and store one page of the board.

    Args:
        offset: Value appended to the ``offset`` query parameter of the
            board URL; the script entry point passes 0, 10, ..., 90.

    Each parsed record is printed and then appended to the output file.
    A page that could not be downloaded is skipped.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Download failed or returned a non-200 status; nothing to parse.
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == "__main__":
    # Crawl the ten board pages one after another: offsets 0, 10, ..., 90.
    for i in range(10):
        main(i * 10)
    # Parallel alternative from the original script, left disabled:
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])

20170513爬取猫眼电影Top100

标签:ascii   main   status   int   out   boa   top100   str   yield   

原文地址:http://www.cnblogs.com/Jiang190/p/6849845.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!