标签:
1.安装Requests
Windows: pip install requests
linux:sudo pip install requests
国内安装缓慢,建议到:
http://www.lfd.uci.edu/~gohlke/pythonlibs/
搜索 requests 并下载
修改后缀名whl为zip并解压,复制requests文件夹到python的lib目录下
2.获取网站内容
# Fetch a Baidu Tieba search page, sending a browser-like User-Agent header.
# BUG FIX: the original was collapsed onto one line and used full-width
# quotes (‘), which is a SyntaxError in Python; rewritten with ASCII quotes.
import requests

useragent = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'}
html = requests.get("http://tieba.baidu.com/f?ie=utf-8&kw=python", headers=useragent)
print(html.text)
3.向网页提交数据
get从服务器获取数据
post向服务器发送数据
get通过构造url中的参数来实现功能
post是将数据放在请求体(request body)中提交的
在使用ajax加载数据的时候是不会在源码中显示的,这时候就要发送post请求来获取数据
# Submit form data via POST (needed when the content is loaded by AJAX and
# therefore absent from the page source).
# BUG FIX: original used full-width quotes (SyntaxError) and relied on a
# `requests` import from an earlier, separate snippet; made self-contained.
import requests

data = {
    'type': '1',
    'sort': '1',
    'currentPage': '3',
}
html_text = requests.post("http://xxxxxx/student/courses/searchCourses", data=data)
print(html_text.text)
---------------------------------------------------------------------------------------------
举个小例子,这是从极客学院的视频上记录下来的笔记
# -*- coding: utf-8 -*-
# Course-listing spider for jikexueyuan.com, transcribed from lecture notes.
# The original was collapsed onto a single line with full-width quotes
# (a SyntaxError); reformatted and the bugs noted inline were fixed.
import re


class spider(object):
    """Scrape title/content/time/level/learner-count from course list pages."""

    def changepage(self, url, total_page):
        """Return page URLs from the page encoded in *url* up to *total_page*.

        The current page number is read from the ``pageNum=`` query parameter.
        """
        now_page = int(re.search(r'pageNum=(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            # BUG FIX: the original passed re.S as re.sub's positional *count*
            # argument (count=16), not as flags; pass it as flags explicitly.
            link = re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url, flags=re.S)
            page_group.append(link)
        return page_group

    def getsource(self, url):
        """Download *url* and return its HTML text."""
        # Imported lazily so the pure regex helpers above/below can be used
        # (and tested) without the third-party requests package installed.
        import requests
        html = requests.get(url)
        return html.text

    def geteveryclass(self, source):
        """Return every ``<li id=...>...</li>`` course block in the page."""
        everyclass = re.findall('(<li id=.*?</li>)', source, re.S)
        return everyclass

    def getinfo(self, eachclass):
        """Extract one course's fields from a single ``<li>`` block.

        Returns a dict with keys: title, content, classtime, classlevel,
        learnnum. Assumes the block matches the jikexueyuan markup — the
        first two plain <em> tags are class time and level.
        """
        info = {}
        info['title'] = re.search('alt="(.*?)"', eachclass, re.S).group(1)
        info['content'] = re.search('display: none;">(.*?)</p>', eachclass, re.S).group(1)
        timeandlevel = re.findall('<em>(.*?)</em>', eachclass, re.S)
        info['classtime'] = timeandlevel[0]
        info['classlevel'] = timeandlevel[1]
        info['learnnum'] = re.search('"learn-number">(.*?)</em>', eachclass, re.S).group(1)
        return info

    def saveinfo(self, classinfo):
        """Append every course dict in *classinfo* to info.txt."""
        # BUG FIX: the original's commented-out lines put the closing bracket
        # inside the key, e.g. each['content' + '\n'] instead of
        # each['content'] + '\n' — fixed and restored. A context manager
        # guarantees the file is closed even if a write raises.
        with open('info.txt', 'a') as f:
            for each in classinfo:
                f.writelines('title:' + each['title'] + '\n')
                f.writelines('content:' + each['content'] + '\n')
                f.writelines('classtime:' + each['classtime'] + '\n')
                f.writelines('classlevel:' + each['classlevel'] + '\n')
                f.writelines('learnnum:' + each['learnnum'] + '\n\n')


if __name__ == '__main__':
    classinfo = []  # one dict per course, accumulated across all pages
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spider()
    # BUG FIX (comment): the original claimed "20 pages" — this fetches
    # pages 1 through 2.
    all_links = jikespider.changepage(url, 2)
    for link in all_links:
        print('读取文件:' + link)
        html = jikespider.getsource(link)          # fetch this page's HTML
        everyclass = jikespider.geteveryclass(html)  # all <li> course blocks
        for each in everyclass:
            info = jikespider.getinfo(each)
            classinfo.append(info)
    jikespider.saveinfo(classinfo)
标签:
原文地址:http://www.cnblogs.com/itliucheng/p/5075350.html