标签:tip cep apple bcb osi member return ext update
1、获取网站课程的分类地址;
"""Scrape the site homepage and collect every category name and link."""
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}


def get_class_data():
    """Fetch the homepage and return the category list.

    Returns:
        list[dict]: one dict per category, with keys '类别名称'
        (category name) and '类别链接' (category link). Only links whose
        last path segment is empty (directory-style URLs ending in '/')
        are kept; file-style links are skipped.
    """
    list_data = []
    url = 'http://www.diaosiweb.net/index.html'
    response = requests.get(url, headers=headers)
    # Let requests sniff the real encoding from the body (site is not UTF-8).
    response.encoding = response.apparent_encoding
    # Parse once and reuse the tree (the original re-parsed for each XPath).
    tree = etree.HTML(response.text)
    class_names = tree.xpath('//div[@id="menu"]/div/ul/li/a/text()')
    class_links = tree.xpath('//div[@id="menu"]/div/ul/li/a/@href')
    for class_name, class_link in zip(class_names, class_links):
        # Keep only directory-style category links such as ".../xxx/".
        if not class_link.split('/')[-1]:
            list_data.append({
                '类别名称': class_name,
                '类别链接': class_link,
            })
    return list_data
2、通过上面获取的地址来获取所有的每个分类下的所有课程名称、链接和发布时间,并保存到Mongodb中去;
"""For every category URL, collect each course's name, link and publish
time from all listing pages, and store the records in MongoDB."""
from spiders_diaosi import get_class_data
import requests
from lxml import etree
import pymongo
from multiprocessing import Pool  # NOTE(review): imported but never used here

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}
client = pymongo.MongoClient('localhost', 27017)
diaosi = client['kecheng_message']
kecheng_message = diaosi['message']


def get_kecheng_data(url):
    """Scrape one listing page: course name, link, publish time -> MongoDB."""
    try:
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        # Parse the page once; the original built three separate trees.
        tree = etree.HTML(response.text)
        kecheng_names = tree.xpath('//ul[@class="g-list1"]/li/a/text()')
        kecheng_links = tree.xpath('//ul[@class="g-list1"]/li/a/@href')
        times = tree.xpath('//ul[@class="g-list1"]/li/span/text()')
        for kecheng_name, kecheng_link, time in zip(kecheng_names, kecheng_links, times):
            data = {
                '课程名称': kecheng_name,
                '课程链接': kecheng_link,
                '发布时间': time,
            }
            # insert_one() replaces Collection.insert(), which was removed
            # in PyMongo 4.
            kecheng_message.insert_one(data)
    except Exception as e:
        # Best-effort per page: log the error and keep going.
        print(e)


def get_max_page(url):
    """Return the total number of listing pages for a category URL."""
    page_response = requests.get(url, headers=headers)
    page_num = int(etree.HTML(page_response.text).xpath(
        '//span[@class="pageinfo"]/strong[1]/text()')[0])
    return page_num


def get_class_id(url):
    """Walk every listing page of one category and scrape each of them."""
    class_response = requests.get(url, headers=headers)
    class_response.encoding = class_response.apparent_encoding
    # Fetch the page count once; the original called get_max_page() twice,
    # costing an extra HTTP request per category.
    max_page = get_max_page(url)
    if max_page != 1:
        # The numeric category id is embedded in the pagination hrefs,
        # e.g. "list_12_2.html" -> 12.
        class_id = int(etree.HTML(class_response.text).xpath(
            '//ul[@class="pagelist"]/li/a/@href')[-1].split('_')[1])
        for num in range(1, max_page + 1):
            new_url = '{}list_{}_{}.html'.format(url, class_id, num)
            get_kecheng_data(new_url)
    else:
        get_kecheng_data(url)


# Read the category links scraped earlier and crawl each category.
for link in get_class_data():
    url = link['类别链接']
    print('开始爬取:' + link['类别名称'])
    get_class_id(url)
    print('已经爬完了:' + link['类别名称'])
3、从数据库中读取每个课程的链接。因为下载地址只有登入之后才可以看到，所以先模拟登入，再获取下载地址和密码，并保存到Mongodb中去。
"""Log in to the site (download links are member-only), then read each course
link from MongoDB, extract its download URL and password, and store them."""
from get_captcha import get_capthca
import pymongo
import re
import requests
from lxml import etree
import random

client = pymongo.MongoClient('localhost', 27017)
diaosi = client['kecheng_message']
kecheng_message = diaosi['message']
dow_message = diaosi['dow_message']
login_url = 'http://www.diaosiweb.net/member/index.php'
# Rotate between a few real browser UA strings.
headers_data = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]
headers = {'User-Agent': random.choice(headers_data)}
login_data = {
    'fmdo': 'login',
    'dopost': 'login',
    'gourl': '',
    'userid': '***',  # your username here (input() also works)
    'pwd': '****',    # your password here
    'vdcode': '',     # filled in below with the captcha the user reads
    'keeptime': '604800',
}
# Save the captcha image locally so the user can read it.
get_capthca(login_url)
captcha = input('输入你看到的验证码:')
login_data['vdcode'] = captcha
session = requests.Session()
session.headers.update(headers)
# BUG FIX: a login form must be POSTed. The original used session.get(),
# which silently ignores the `data` payload, so the login never happened.
login_response = session.post(login_url, data=login_data)
for link in kecheng_message.find():
    try:
        html = session.get(link['课程链接'])
        html.encoding = html.apparent_encoding
        # The download URL lives in a hidden div only visible to members.
        # Extraction is inside the try so one malformed page cannot kill
        # the whole run (the original crashed on a missing div).
        dow_url = re.compile("<div id='pan' style=\"display:none;\">(.*?)</div>").findall(html.text)[0]
        mima = etree.HTML(html.text).xpath('//span[@style]/text()')
        record = {
            'name': link['课程名称'],
            'link': link['课程链接'],
            'dow_url': dow_url,
        }
        # Note operator precedence: `A or B and C` is `A or (B and C)`,
        # matching the original logic.
        if len(mima) == 0 or len(mima) > 5 and '网盘提取密码' not in mima[-1].split(':'):
            record['mima'] = '没有密码'
        else:
            record['mima'] = mima
        # insert_one() replaces the deprecated Collection.insert().
        dow_message.insert_one(record)
        print(record)
    except Exception as e:
        print(e)
        print(link['课程名称'])
下面是获取网页验证码的代码。
"""Fetch the login page's captcha image and save it to the working
directory (manual entry for now; OCR could be added later)."""
import requests
from lxml import etree
import os

login_url = 'http://www.diaosiweb.net/member/index.php'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}


def get_capthca(url):
    """Download the login page's captcha image to ./captcha.jpg.

    The misspelled name is kept deliberately: other scripts import it as
    ``get_capthca``.

    NOTE(review): the image is fetched with a fresh request rather than a
    shared session, so its cookie may not match the later login session —
    confirm the site accepts this before relying on it.
    """
    login_response = requests.get(url, headers=headers)
    # The src is relative ("../include/..."); strip the ".." and prefix
    # the site root to build an absolute URL.
    image_url = 'http://www.diaosiweb.net' + etree.HTML(login_response.text).xpath(
        '//img[@id="vdimgck"]/@src')[0].replace('..', '')
    # Send the same UA headers for the image request too (the original
    # omitted them here only).
    image_response = requests.get(image_url, headers=headers).content
    with open('captcha.jpg', 'wb') as f:
        # `with` closes the file; the original's explicit close() inside
        # the block was redundant.
        f.write(image_response)
    print('验证码已经保存到:{}'.format(os.getcwd()))
恩,这样差不多就完成了一个爬虫项目了,因为是第一次完整的爬取,所以写的比较乱,也没有思维图,也知道有很多地方不完善,但是发懒筋了,不想写了,先这样吧!
Python获取个人网站的所有课程下载链接和密码,并保存到Mongodb中
标签:tip cep apple bcb osi member return ext update
原文地址:http://www.cnblogs.com/114811yayi/p/6938948.html