标签:range title ret 技术分享 decode 表示 blog imp ima
# Image scraper for pengfu.com: for a range of index pages, download each
# post's picture and save it under a file name derived from the post title.
import os
import re
import urllib.request

# Directory (relative to the working directory) that receives the images.
_IMAGE_DIR = 'image'

# Characters that are not allowed in file names on Windows (and '/' on POSIX).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def page(pg):
    """Fetch the HTML source of one pengfu.com index page.

    Args:
        pg: Page number substituted into the ``index_<pg>.html`` URL.

    Returns:
        The response body decoded as UTF-8 text.
    """
    url = 'https://www.pengfu.com/index_%s.html' % pg
    # The page declares charset=utf-8, so decode the raw bytes accordingly.
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf8')


def title(html):
    """Extract the post titles from an index page.

    Args:
        html: HTML source of the page as text.

    Returns:
        A list with the link text of every ``<h1 class="dp-b">`` heading,
        in document order (empty if none match).
    """
    # Raw string so the pattern's backslashes/quotes are not re-interpreted.
    reg = re.compile(r'<h1 class="dp-b"><a href=".*?" target="_blank">(.*?)</a>')
    return re.findall(reg, html)


def content(html):
    """Extract the image URLs from an index page.

    Args:
        html: HTML source of the page as text.

    Returns:
        A list with the ``src`` value of every ``<img>`` tag whose ``src``
        attribute is immediately followed by a ``width`` attribute.
    """
    reg = r'<img src="(.*?)" width='
    return re.findall(reg, html)


def _safe_filename(name):
    """Replace characters that cannot appear in a file name with ``_``."""
    return _UNSAFE_FILENAME_CHARS.sub('_', name)


def download(url, name):
    """Download ``url`` and save it as ``image/<name>.jpg``.

    Args:
        url: Address of the image to fetch.
        name: Base file name (typically the post title); characters that are
            illegal in file names are replaced with ``_``.
    """
    # Create the target directory on first use instead of failing.
    os.makedirs(_IMAGE_DIR, exist_ok=True)
    # os.path.join picks the right separator for the platform, replacing the
    # hard-coded Windows backslash.
    path = os.path.join(_IMAGE_DIR, '%s.jpg' % _safe_filename(name))
    urllib.request.urlretrieve(url, path)


def main():
    """Download the images of index pages 5 through 8."""
    for i in range(5, 9):
        html = page(i)
        title_list = title(html)
        content_list = content(html)
        # Pair each title with an image by position; zip stops at the
        # shorter list, so unmatched trailing entries are skipped.
        for m, n in zip(title_list, content_list):
            print('正在下载>>>>>:' + m, n)
            download(n, m)


if __name__ == '__main__':
    main()
标签:range title ret 技术分享 decode 表示 blog imp ima
原文地址:http://www.cnblogs.com/UncleYong/p/6973887.html