#!/usr/bin/python
# coding=utf-8

import urllib
import urllib2

def loadPage(url, filename):
    """
    Send a request to the given URL and return the server's response.
    url: the page URL to fetch
    filename: name of the file being processed (used only for logging)
    """
    print("Downloading " + filename)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3253.3 Safari/537.36"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request).read()
    return response

def writePage(html, filename):
    """
    Write the HTML content to a local file.
    """
    print("Saving " + filename)
    with open(filename, 'w') as f:
        f.write(html)
    print("_" * 30)

def tiebaSpider(fullurl, beginPage, endPage):
    """
    Tieba crawl scheduler: builds and processes the URL for each page.
    fullurl: the base Tieba URL, already including the kw query string
    beginPage: first page to fetch
    endPage: last page to fetch
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50  # Tieba paginates in steps of 50 posts
        filename = "page_" + str(page) + ".html"
        # Build from the fullurl parameter; the original used the bare
        # module-level url here, which dropped the kw query string.
        pageurl = fullurl + "&pn=" + str(pn)
        html = loadPage(pageurl, filename)
        writePage(html, filename)
    print("Thanks for using!")

if __name__ == "__main__":
    kw = raw_input("Enter the Tieba forum name to crawl: ")
    beginPage = int(raw_input("Enter the start page: "))
    endPage = int(raw_input("Enter the end page: "))

    url = "http://tieba.baidu.com/f?"
    key = urllib.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
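The script above targets Python 2 (urllib2, raw_input). For readers on Python 3, here is a minimal sketch of the same spider; it is not from the original post, just a straightforward port assuming the same Tieba URL scheme (kw and pn query parameters). urllib2 becomes urllib.request, urllib.urlencode becomes urllib.parse.urlencode, and the response must be written in binary mode because urlopen().read() returns bytes.

# Python 3 port (sketch, not from the original article)
import urllib.parse
import urllib.request

def load_page(url, filename):
    print("Downloading " + filename)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3253.3 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(request).read()  # bytes in Python 3

def write_page(html, filename):
    print("Saving " + filename)
    with open(filename, "wb") as f:  # "wb": html is bytes, not str
        f.write(html)
    print("_" * 30)

def tieba_spider(fullurl, begin_page, end_page):
    for page in range(begin_page, end_page + 1):
        pn = (page - 1) * 50  # same 50-posts-per-page offset as above
        filename = "page_%d.html" % page
        html = load_page(fullurl + "&pn=" + str(pn), filename)
        write_page(html, filename)
    print("Thanks for using!")

if __name__ == "__main__":
    kw = input("Enter the Tieba forum name to crawl: ")
    begin_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    fullurl = "http://tieba.baidu.com/f?" + urllib.parse.urlencode({"kw": kw})
    tieba_spider(fullurl, begin_page, end_page)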
Original article: http://bryanguo.blog.51cto.com/3960518/1979178