# Tags: show data res yield text port accept nav .com
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy import FormRequest


class Login1Spider(scrapy.Spider):
    """Log in to GitHub through its HTML login form, then fetch a profile page.

    Request flow: ``start_requests`` -> ``github_login`` -> ``github_after``
    -> ``show``.  Every request in the chain carries the same ``cookiejar``
    meta key: Scrapy does not propagate that key automatically, so a request
    issued without it would not reuse the session cookies obtained at login.
    """

    name = 'login1'
    allowed_domains = ['github.com']
    # Browser-like headers sent with the login POST.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://github.com/',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    # Pages requested after the login attempt (see ``github_after``).
    start_urls = ['https://github.com/758109577']

    def start_requests(self):
        """Yield the request for the login page, opening cookie jar ``1``."""
        urls = ['https://github.com/login']
        for url in urls:
            yield Request(url, meta={'cookiejar': 1}, callback=self.github_login)

    def github_login(self, response):
        """Submit the login form found in ``response``.

        Returns:
            A ``FormRequest`` posting the credentials to ``/session``, or
            ``None`` when the page contains no ``authenticity_token`` input
            (in which case the login is not attempted).
        """
        # The token is a hidden form input; its location in the page can be
        # explored interactively with `scrapy shell "<url>"`.
        authenticity_token = response.xpath(
            "//input[@name='authenticity_token']/@value"
        ).extract_first()
        # Lazy %-formatting: does not raise when the token is None.
        self.logger.info('authenticity_token=%s', authenticity_token)
        if authenticity_token is None:
            self.logger.error('authenticity_token not found on %s', response.url)
            return None

        # The POST URL was taken from captured browser traffic (e.g. Fiddler).
        # dont_click=True submits the form data without clicking any element.
        return FormRequest.from_response(
            response,
            url='https://github.com/session',
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata={
                # Check mark; the original literal was '?', which looks like
                # a mis-encoded copy of this character.
                # NOTE(review): confirm against the live form's hidden field.
                'utf8': '✓',
                'authenticity_token': authenticity_token,
                # NOTE(review): placeholder credentials hard-coded in source.
                'login': 'aaaaaa@qq.com',
                'password': 'xxxxxx',
            },
            callback=self.github_after,
            dont_click=True,
        )

    def github_after(self, response):
        """Log whether the login succeeded, then request every start URL.

        The follow-up requests are yielded whether or not the success marker
        is found; ``show`` reports the final outcome.
        """
        # Text of the selected navigation tab on the post-login landing page.
        nav_texts = response.xpath(
            "//a[@class='UnderlineNav-item selected']/text()"
        ).extract()
        # The marker string being present means the login succeeded.
        if 'Browse activity' in nav_texts:
            self.logger.info('我已经登录成功了,这是我获取的关键字:Browse activity')
        for url in self.start_urls:
            # Pass the cookiejar along so the request uses the login session.
            yield Request(
                url=url,
                meta={'cookiejar': response.meta['cookiejar']},
                callback=self.show,
            )

    def show(self, response):
        """Print the profile nickname texts if they contain the expected user."""
        print("############################")
        nicknames = response.xpath(
            "//span[@class='p-nickname vcard-username d-block']/text()"
        ).extract()
        if 'aaaaaa' in nicknames:
            print(nicknames)
            print("############################")
        else:
            print("失败")

# Tags (from the source blog post): show data res yield text port accept nav .com
# Original article: http://blog.51cto.com/haoyonghui/2140888