码迷,mamicode.com
首页 > 数据库 > 详细

IMDB TOP 250爬虫

时间:2017-09-18 22:28:15      阅读:290      评论:0      收藏:0      [点我收藏+]

标签:time   com   mozilla   core   print   作业   enc   sts   ***   

这个小学期Python大作业搞了个获取IMDB TOP 250电影全部信息的爬虫。第二次写爬虫,比在暑假集训时写的熟练多了。欢迎大家评论。

'''
************************************************
*Made by 1120162015 李博
*        1120161966 张嘉熙
*Time:2017.9.11
*Target:All movies' information of IMDB TOP_250
*Resources:http://www.imdb.cn/IMDB250/
*纯原创 转载请注明作者:李博,张嘉熙
************************************************
'''
 11 
 12 import re
 13 import requests
 14 import numpy as np
 15 import matplotlib.pyplot as plt
 16 from bs4 import BeautifulSoup
 17 
 18 num = 1 #电影计数
 19 All_txt = [] #全部电影的信息
 20 headers={User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0}#浏览器代理
 21 def  getHTMLText(url):
 22     try:
 23         #print(url)
 24         r = requests.get( url,headers = headers )
 25         #print(r)
 26         r.encoding = utf-8
 27         return r.text
 28     except:
 29         return "错误"
 30 
 31 #从每一部电影的页面中获取全部信息
 32 def get_all_information(url,page):
 33     global num,All_txt
 34     txt = getHTMLText(url)
 35     if txt != "错误":
 36         print(page+str(page)+ NO.+str(num)+ Get it!)
 37     if num == 247:
 38         print(Finished!!!)
 39     soup = BeautifulSoup(txt,"html.parser")
 40     Cname,Ename,Score,title,Actor,Starring,Infor = ‘‘,‘‘,‘‘,‘‘,‘‘,‘‘,‘‘
 41 
 42     #TOP250-film_Chinese_name&Score
 43     infor_1 = soup.find_all(div,class_ = hdd)
 44     rel = <h3>+[\s\S]*?+</h3>
 45     pattern = re.compile(rel)
 46     Cname = ‘‘.join(pattern.findall(str(infor_1[0])))
 47     Cname = Cname.replace(<h3>,‘‘).replace(</h3>,‘‘)
 48     #print(Cname)
 49     #find_the_year & save
 50     rel = +[\s\S]*?+
 51     pattern = re.compile(rel)
 52     time_ = ‘‘.join(pattern.findall(Cname))
 53     #print(time_)
 54     with open(time.txt,a,encoding=utf-8) as t:
 55         t.write( time_.replace(,‘‘).replace(,‘‘) + \n )
 56     #find_Score
 57     rel = <i>+[\s\S]*?+</i>
 58     pattern = re.compile(rel)
 59     Score = ‘‘.join(pattern.findall(str(infor_1[0])))
 60     Score = Score.replace(<i>,‘‘).replace(</i>,‘‘)
 61     #print(Cname,Score)
 62 
 63     #TOP250-film_many_infor
 64     now = soup.find_all(div,class_ = bdd clear)
 65     #print(now[0])
 66     a = BeautifulSoup(str(now[0]), "html.parser")
 67     many_infor = a.find_all(li)
 68 
 69     #TOP250-film_Ename
 70     Ename = str(many_infor[0]).replace(<li>,‘‘).replace(<i>,‘‘).replace(</i>,‘‘).replace(</li>,‘‘).replace(<a>,‘‘).replace(</a>,‘‘)
 71     #TOP250-film_Actor
 72     Actor_temp = BeautifulSoup(str(many_infor[2]), "html.parser").find_all(a)
 73     Actor = Actor_temp[0].get_text().replace(导演:,‘‘)
 74     #TOP250-film_Starring
 75     Starring_temp = BeautifulSoup(str(many_infor[3]), "html.parser").find_all(a)
 76     for i in Starring_temp:
 77         Starring += i.get_text().replace( ,‘‘) +  
 78     #print(Starring)
 79 
 80     #Top-film_Infor
 81     for j in range(4,7):
 82         Infor_temp = BeautifulSoup(str(many_infor[j]), "html.parser")
 83         for i in Infor_temp.children:
 84             Infor += i.get_text().replace( ,‘‘) +  
 85         Infor += \n
 86     #print(Infor)
 87 
 88     #TOP250-film_Synopsis
 89     content =  soup.find_all(div,class_ = fk-4 clear)
 90     #print(content)
 91     soup_con = BeautifulSoup(str(content[0]), "html.parser")
 92     title = soup_con.find_all(div,class_ = hdd)
 93     title = str(title[0]).replace(<div class="hdd">,‘‘).replace(</div>,\n)
 94     #print(title)
 95     content_1 = soup_con.find_all(div,class_ = bdd clear)
 96     content_1 = str(content_1[0]).replace(<div class="bdd clear" style="font-size:15px">,‘‘).replace(</div>,‘‘)
 97     content_1 = content_1.replace(<!-- <p><a href="#">更多剧情 >></a></p>  -->,‘‘).replace(<br/>,\n)
 98 
 99     #Save_all_information
100     All_txt.append(+str(num)++\n)
101     All_txt.append( Cname+\n )
102     All_txt.append( 【英文名】+Ename+\n )
103     All_txt.append( 【评分】+Score+\n )
104     All_txt.append( 【导演】+Actor+\n )
105     All_txt.append( 【主演】+Starring+\n )
106     All_txt.append( Infor+\n )
107     All_txt.append( title+\n+content_1+\n )
108     All_txt.append(\n)
109     num += 1
110 
111 #在每一页中得到当前页的全部电影的url
def getin_one(url, page):
    """Collect the movie links of one ranking page and scrape each movie.

    Every link found inside the first ``div`` with class "ss-3 clear" is
    prefixed with the site root and passed to ``get_all_information``.

    Args:
        url: address of the ranking page.
        page: number of the ranking page, forwarded to
            ``get_all_information`` for its progress message.
    """
    txt = getHTMLText(url)
    soup = BeautifulSoup(txt, "html.parser")
    temp = soup.find_all("div", class_="ss-3 clear")
    if not temp:
        # Happens when the download failed (getHTMLText returned its error
        # sentinel) or the page has no movie list; indexing would crash.
        print("page" + str(page) + " has no movie list: " + url)
        return
    # The capture group yields the bare href value of each matching link.
    All_url = re.findall(r'<a href="([\s\S]*?)">', str(temp[0]))
    for href in All_url:
        get_all_information("http://www.imdb.cn" + href, page)
124 
125 #将所有电影的年份统计并生成条形图
def _count_by_era(lines):
    """Return the number of years per era as ``[a, b, c, d, e, f]``.

    The slots are 1920-1940, 1940-1960, 1960-1980, 1980-2000, everything
    else, and "no information" (a year of 0, or a blank/non-numeric line).
    """
    counts = [0] * 6
    for line in lines:
        try:
            year = int(line.strip())
        except ValueError:
            # A movie without a recognisable year leaves a blank line.
            year = 0
        if year == 0:
            counts[5] += 1
        elif 1920 <= year < 1940:
            counts[0] += 1
        elif 1940 <= year < 1960:
            counts[1] += 1
        elif 1960 <= year < 1980:
            counts[2] += 1
        elif 1980 <= year < 2000:
            counts[3] += 1
        else:
            # Every remaining year lands in the last era bucket; this
            # includes any year before 1920.
            counts[4] += 1
    return counts


def Analyze_some_infor():
    """Draw a bar chart of how many movies fall into each era.

    Reads one year per line from ``time.txt``, saves the chart as
    ``time_pic.jpg`` and then shows it.
    """
    plt.rc("font", family="SimHei", size=13)  # font family and size
    # Years are parsed with int() instead of eval(), so the file content is
    # never executed as code.
    with open("time.txt", encoding="utf-8") as file:
        times = _count_by_era(file)
    range_time = [
        "1920-1940",
        "1940-1960",
        "1960-1980",
        "1980-2000",
        "2000-现在",
        "无信息",
    ]
    idx = np.arange(len(range_time))
    width = 0.5
    plt.bar(idx, times, width, color="green")
    plt.xticks(idx + width / 2, range_time, rotation=40)
    plt.xlabel("电影年代")
    plt.ylabel("数目")
    plt.savefig("time_pic.jpg")
    plt.show()
155 
def main():
    """Scrape all nine ranking pages, save the results and plot the years.

    Writes the collected text to ``All_infor.txt`` (replacing any previous
    content) and finally calls ``Analyze_some_infor``.
    """
    # get_all_information appends one year per movie to time.txt; start
    # from an empty file so repeated runs do not count movies twice.
    with open("time.txt", "w", encoding="utf-8"):
        pass
    getin_one("http://www.imdb.cn/IMDB250/", 1)
    for i in range(2, 10):
        getin_one("http://www.imdb.cn/imdb250/" + str(i), i)
    # Mode "w" truncates the file, so a separate clearing step is not needed.
    with open("All_infor.txt", "w", encoding="utf-8") as x:
        x.writelines(All_txt)
    Analyze_some_infor()


if __name__ == "__main__":
    main()

作者: LB919
出处:http://www.cnblogs.com/L1B0/
该文章为LB919投入了时间和精力的原创;
如有转载,荣幸之至!请随手标明出处;

IMDB TOP 250爬虫

标签:time   com   mozilla   core   print   作业   enc   sts   ***   

原文地址:http://www.cnblogs.com/L1B0/p/7545073.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!