简单爬虫获取电影资源

时间：2017-04-12 12:37:44 阅读：325 评论：0 收藏：0 [点我收藏+]

标签：list value use charset values 赋值 span www .exe

代码如下:
# -*- coding: utf-8 -*-
__authoer__ = "wilsoon"


import urllib
import re
import MySQLdb

# Module-level MySQL connection and cursor used by the crawl loop below.
# NOTE(review): host and credentials are hard-coded here; consider moving
# them to configuration before sharing this script.
conn = MySQLdb.connect(
    host='192.168.112.128',
    port=3306,
    user='movie',
    passwd='movie',
    db='movie',
    charset='utf8'
)
cur = conn.cursor()

def GetList(pn):
    """Fetch one listing page and return the movie detail-page paths on it.

    Args:
        pn: Listing page number; it is formatted with ``%d`` into the
            ``list_<pn>.html`` URL.

    Returns:
        A list holding the captured ``href`` value of every
        ``<dd><strong><a href="...">`` tag in the page, in page order.
        The list is empty when nothing matches.
    """
    # NOTE(review): urllib.urlopen is the Python 2 API, like the rest of
    # this script.
    page = urllib.urlopen('http://www.piaohua.com/html/dongzuo/list_%d.html' % pn)
    try:
        html = page.read()
    finally:
        # Release the connection even if reading the body fails.
        page.close()
    # Non-greedy capture so each match stops at the first closing quote.
    return re.findall(r'<dd><strong><a href="(.*?)">', html)
# Example of the list GetList returns for one listing page:
#
# ['/html/dongzuo/2017/0409/31924.html', '/html/dongzuo/2017/0409/31921.html',
#  '/html/dongzuo/2017/0408/31918.html', '/html/dongzuo/2017/0315/31856.html',
#  '/html/dongzuo/2017/0320/31873.html', '/html/dongzuo/2017/0320/31872.html',
#  '/html/dongzuo/2017/0318/31869.html', '/html/dongzuo/2017/0221/31788.html',
#  '/html/dongzuo/2017/0310/31849.html', '/html/dongzuo/2017/0310/31848.html',
#  '/html/dongzuo/2017/0306/31833.html', '/html/dongzuo/2017/0303/31822.html',
#  '/html/dongzuo/2017/0228/31815.html', '/html/dongzuo/2017/0215/31773.html']
def GetContent(url):
    """Fetch one movie detail page and extract its title, description and link.

    Args:
        url: Site-relative path of the detail page, as returned by GetList
            (for example ``/html/dongzuo/2017/0409/31921.html``).

    Returns:
        A ``(title, content, link)`` tuple:
        title is the text of the first ``<h3>...</h3>`` element;
        content is everything between ``下载页面</div>`` and the next
        ``<strong><span style="color: #ff0000``;
        link is the first ``href`` that follows
        ``line-height: 18px" width="100%">``.

    Raises:
        IndexError: if any of the three pieces cannot be found in the page.
    """
    # The paths from GetList already start with "/", so strip it to avoid
    # requesting "http://www.piaohua.com//html/...".
    page = urllib.urlopen('http://www.piaohua.com/%s' % url.lstrip('/'))
    try:
        html = page.read()
    finally:
        # Release the connection even if reading the body fails.
        page.close()

    title = _first_match(r'<h3>(.*?)</h3>', html, 'title')

    # re.S lets "." match newlines, because the description spans many lines.
    content = _first_match(
        r'下载页面</div>(.*?)<strong><span style="color: #ff0000',
        html,
        'content',
        re.S
    )

    link = _first_match(
        r'line-height: 18px" width="100%"><a href="(.*?)">',
        html,
        'download link'
    )
    return title, content, link


def _first_match(pattern, html, what, flags=0):
    """Return the first capture of pattern in html, or raise IndexError naming what."""
    found = re.findall(pattern, html, flags)
    if not found:
        # Same exception type a bare ``[0]`` would raise, but with a message.
        raise IndexError('no %s found in movie page' % what)
    return found[0]

# Crawl listing pages 1..374 and store every movie found on them.
try:
    for n in range(1, 375):
        for i in GetList(n):  # i is the site-relative path of one movie page
            title, content, link = GetContent(i)
            # Single parenthesised argument: prints the same on Python 2 and 3.
            print('正在保存第%d页的  %s ' % (n, title))
            # Let the driver quote the values: the scraped text can contain
            # quotes or SQL fragments, so it must never be formatted into the
            # statement itself.
            cur.execute(
                "insert into movie(id,title,content,link) VALUES (NULL,%s,%s,%s)",
                (title, content, link)
            )
            # Commit per row so an interrupted crawl keeps what was saved.
            conn.commit()
finally:
    # Release the database resources whether the crawl finishes or fails.
    cur.close()
    conn.close()

简单爬虫获取电影资源

标签：list value use charset values 赋值 span www .exe

原文地址：http://www.cnblogs.com/5icode/p/6698526.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行