码迷,mamicode.com
首页 > 编程语言 > 详细

Python爬取豆瓣高分电影前250名

时间:2019-03-02 23:52:43      阅读:358      评论:0      收藏:0      [点我收藏+]

标签:src   add   exec   name   string   tar   sheet   1.5   filter   

# Douban Top 250 scraper.
#
# Walks the ten listing pages of https://movie.douban.com/top250, follows every
# movie link found on them, extracts a handful of fields from each detail page
# and inserts one row per movie into the MySQL table `doubanmovie`.

import logging
import re
import time

import pymysql
import requests
import xlwt
from lxml import etree

logger = logging.getLogger(__name__)

# Seconds to wait on a single HTTP request. Without a timeout requests.get()
# can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Patterns applied to the raw detail-page HTML. Compiled once here instead of
# on every call; re.S lets `.` span line breaks inside the markup.
_GENRE_RE = re.compile('<span property="v:genre">(.*?)</span>', re.S)
_COUNTRY_RE = re.compile('<span class="pl">制片国家/地区:</span>(.*?)<br/>', re.S)
_RELEASE_RE = re.compile('上映日期:</span>.*?>(.*?)</span>', re.S)
_RUNTIME_RE = re.compile('<span class="pl">片长:</span>.*?>(.*?)</span>', re.S)

# HTTP headers sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
    # The cookie value is reproduced exactly as published, split per cookie
    # only for readability (adjacent literals concatenate to the same string).
    'Cookie': (
        'gr_user_id = c6f58a39 - ea25 - 4f58 - b448 - 545070192c4e;'
        '59a81cc7d8c04307ba183d331c373ef6_gr_session_id = e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26;'
        '59a81cc7d8c04307ba183d331c373ef6_gr_last_sent_sid_with_cs1 = e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26;'
        '59a81cc7d8c04307ba183d331c373ef6_gr_last_sent_cs1 = N % 2FA;'
        '59a81cc7d8c04307ba183d331c373ef6_gr_session_id_e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26 = true;'
        'grwng_uid = 9ec14ad9 - 5ac0 - 4bb1 - 81c1 - bc60d2685710;'
        'abtest_ABTest4SearchDate = b;'
        'xzuuid = 79426b52;'
        '_uab_collina = 154660443606130958890473;'
        'TY_SESSION_ID = 907f32df - c060 - 49ca - b945 - 98215cc03475;'
        'rule_math = pvzq3r06hi'
    ),
}

# NOTE(review): the database credentials are hard-coded in source; consider
# loading them from the environment or a config file before sharing this file.
conn = pymysql.connect(host='localhost', user='root', passwd='momiao5201314',
                       db='doubanmovie', port=3306, charset='utf8')
cursor = conn.cursor()  # cursor object used for every INSERT below

# Disabled Excel export, kept for reference (xlwt is still imported above):
#   workbook = xlwt.Workbook(encoding='utf-8')        # create a workbook and set its encoding
#   worksheet = workbook.add_sheet('My Worksheet')    # create a worksheet
#   # define the header row
#   header = ['movie_name', 'director', 'actors,style', 'country', 'release_time', 'time', 'score']
#   for h in range(len(header)):
#       workbook.write(0, h, header[h])
# NOTE(review): if this is revived, the cells should presumably be written via
# worksheet.write(...), and 'actors,style' looks like two column names fused by
# a missing pair of quotes -- confirm before enabling.


def get_movie_url(url):
    """Scrape every movie linked from one Top-250 listing page.

    Args:
        url: Address of a listing page.

    Downloads the page, collects the href of each `div.hd > a` link and passes
    each one to get_movie_info(). Returns None.
    """
    html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    selector = etree.HTML(html.text)
    for movie_url in selector.xpath('//div[@class="hd"]/a/@href'):
        get_movie_info(movie_url)


def get_movie_info(url):
    """Extract one movie's details and insert them into `doubanmovie`.

    Args:
        url: Address of a movie detail page.

    The row is inserted through the module-level cursor but not committed
    here. List-valued fields (XPath / regex results) are stored as their
    str() representation, exactly as the parameters are built below.

    A movie is skipped, with a warning, when the page has no cast block or no
    genre tag. Returns None.
    """
    html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    page = html.text
    selector = etree.HTML(page)

    movie_name = selector.xpath('//*[@id="content"]/h1/span[1]/text()')  # 1. title
    director = selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')  # 2. director

    # 3. cast. The original author flagged this XPath as uncertain; it relies
    # on the cast being the third span of the info block.
    actor_nodes = selector.xpath('//*[@id="info"]/span[3]/span[2]')
    if not actor_nodes:
        logger.warning('Skipping %s: cast block not found', url)
        return
    actor = actor_nodes[0].xpath('string(.)')

    # 4. genre: the first two genre tags concatenated. Slicing instead of
    # indexing [0] and [1] keeps films that list only a single genre, which
    # used to raise IndexError and be dropped without a trace.
    genres = _GENRE_RE.findall(page)
    if not genres:
        logger.warning('Skipping %s: no genre tag found', url)
        return
    style = ''.join(genres[:2])

    country = _COUNTRY_RE.findall(page)  # 5. country / region of production
    release_time = _RELEASE_RE.findall(page)  # 6. release date
    # 7. running time; deliberately not named `time`, which would shadow the
    # time module inside this function.
    runtime = _RUNTIME_RE.findall(page)
    score = selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')  # 8. rating

    print(str(movie_name))

    # Parameterized query: the driver escapes the values, unlike an INSERT
    # assembled with str.format().
    cursor.execute(
        "insert into doubanmovie(name,director,actor,style,country,release_time,time,score) values(%s,%s,%s,%s,%s,%s,%s,%s)",
        (str(movie_name), str(director), str(actor), str(style),
         str(country), str(release_time), str(runtime), str(score)),
    )


if __name__ == '__main__':
    # Ten listing pages of 25 movies each: start=0, 25, ..., 225.
    urls = ['https://movie.douban.com/top250?start={}&filter='.format(num)
            for num in range(0, 250, 25)]
    try:
        for url in urls:
            get_movie_url(url)
            # Commit after each page so a failure later in the run does not
            # discard the rows already scraped.
            conn.commit()
            time.sleep(2)  # pause between listing pages
    finally:
        # Release the database resources even if scraping aborted.
        cursor.close()
        conn.close()

技术图片

Python爬取豆瓣高分电影前250名

标签:src   add   exec   name   string   tar   sheet   1.5   filter   

原文地址:https://blog.51cto.com/12884584/2357199

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!