标签:EDA less chrome random submit header ict Nid stat
今天终于实现了爬虫,爬取到了一定的信息
代码:
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import bs4
from selenium import webdriver
from time import sleep
import time
from PIL import Image
class Login(object):
def __init__(self):
options=webdriver.ChromeOptions()
options.add_argument(‘--headless‘)
options.add_argument(‘--disable-gpu‘)
options.add_argument(‘--ignore-certificate-errors‘)
options.add_argument("--disable-gpu")
self.driver=webdriver.Chrome(options=options)
self.driver.maximize_window()
self.driver.set_window_size(‘1920‘,‘1080‘) #设置浏览器宽480,高800
self.driver.get(‘http://tiedao.vatuu.com/service/login.html?returnUrl=return‘)
cookie = self.driver.get_cookies()
print(type(cookie))
print(cookie[0][‘value‘])
self.cookie = cookie[0][‘value‘]
print(type(self.cookie))
self.headers = {
‘Accept‘: ‘text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8‘,
‘Accept-Encoding‘: ‘gzip, deflate‘,
‘Accept-Language‘: ‘zh-CN,zh;q=0.9,en;q=0.8‘,
‘Connection‘: ‘keep-alive‘,
‘Cookie‘: self.cookie,
‘Host‘: ‘tiedao.vatuu.com‘,
‘Upgrade-Insecure-Requests‘: ‘1‘,
‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko)Chrome/72.0.3626.121 Safari/537.36‘
}
print(self.headers)
self.url = ‘http://tiedao.vatuu.com/vatuu/StudentScoreInfoAction?setAction=studentScoreQuery&viewType=\
studentScore&orderType=submitDate&orderValue=desc‘
# 时间格式进行格式化
def time_format(self):
current_time = time.strftime(‘%Y%m%d%H%M%S‘, time.localtime(time.time()))
return current_time
def cut(self):
# 截取全屏
self.driver.save_screenshot("C:\\Users\\kang\\Desktop\\pachong\\full.png")
# 要截屏的目标元素
element = self.driver.find_element_by_id("randomPhoto")
print(element.location)
print(element.size)
# 获取element的顶点坐标
xPiont = element.location[‘x‘]
yPiont = element.location[‘y‘]
# 获取element的宽、高
element_width = xPiont + element.size[‘width‘]
element_height = yPiont + element.size[‘height‘]
picture = Image.open(‘C:\\Users\\kang\\Desktop\\pachong\\full.png‘)
‘‘‘
crop()-- 一个显式的参数:一个4元组
Image.crop(box=None):图像返回一个矩形区域,box是一个四元组 限定所述左,上,右,和下像素坐标
参数:box--裁剪矩形,作为(左,上,右,下)-tuple;返回类型:Image;返回:一个Image对象
所以你应该重写它:
img.crop((414,122,650,338))
# ^ 4-tuple ^
‘‘‘
picture = picture.crop((xPiont, yPiont, element_width-30, element_height))
src = "C:\\Users\\kang\\Desktop\\pachong\\"+self.time_format()+".png"
picture.save(src)
def login(self):
self.cut()
user = self.driver.find_element_by_id(‘username‘)
passwor = self.driver.find_element_by_id(‘password‘)
ranstring = self.driver.find_element_by_id(‘ranstring‘)
confirm = self.driver.find_element_by_id(‘submit2‘)
name = input("请输入:username")
password = input("请输入:password")
ran = input("请输入:验证码")
user.send_keys(name)
passwor.send_keys(password)
ranstring.send_keys(ran)
confirm.click()
def get_soup(url,cookies=‘‘):
if cookies:
html = get_html(url,cookies)
soup = BeautifulSoup(html, ‘lxml‘)
return soup
else:
html = get_html(url,‘‘)
soup = BeautifulSoup(html, ‘lxml‘)
return soup
def html_utf(content):
html=content
html_doc=str(html,‘utf-8‘) #html_doc=html.decode("utf-8","ignore")
return html_doc
def get_html(url,cookies):
try:
r = requests.get(url, timeout=5, cookies=cookies)
r.raise_for_status()
return html_utf(r.content)
except:
return "ERROR"
def score(url,cookie):
# print(get_html(url,{‘JSESSIONID‘:cookie}))
soup = get_soup(url,{‘JSESSIONID‘:cookie})
table = soup.find(‘table‘, attrs={‘id‘:‘table3‘})
print(table)
trs = table.find_all(‘tr‘)
contents = []
for tr in trs:
ths = tr.find_all(‘th‘)
if ths:
for th in ths:
print(th.string.strip(),‘ | ‘,end=‘‘)
print(‘‘)
else:
content = []
tds = tr.find_all(‘td‘)
for td in tds:
if td.string is None:
pass
else:
str = td.string.strip()
content.append(str)
print(str,‘ | ‘,end=‘‘)
contents.append(content)
print(‘‘)
for con in contents:
for c in con:
print(c,‘||‘,end=‘‘)
print(‘‘)
def course(url,cookie):
soup = get_soup(url,{‘JSESSIONID‘:cookie})
table = soup.find(‘table‘,attrs={‘class‘:‘table_border‘})
trs = table.find_all(‘tr‘)
contents = []
for tr in trs:
tds = tr.find_all(‘td‘)
for td in tds:
res = td.find_all(text=True)
strs = ‘‘
for s in res:
strs+=s
# print(strs)
contents.append(strs)
print(‘+++++++++++++++++++++++++++++‘)
‘‘‘
classList 按时间顺序排好的课程
‘‘‘
classList = []
for i in range(1,8):
for j in range(1,13):
content = ‘星期‘+str(i)+‘:第‘+str(j)+‘节:‘+contents[j*8+i]
print(content)
classList.append(content)
def room(url,cookie):
‘‘‘
首先构建表单
‘‘‘
week = input(‘请输入周数:‘)
day_num = input(‘请输入星期几:‘)
class_num = input(‘请输入:\
1:第一二节\
2:第三四节\
3:整个上午\
4:第六七节\
5:第八九节\
6:整个下午\
7:晚自习‘)
week = pow(2,int(week)-1)
day_time = ‘‘
if class_num == ‘1‘:
day_time=‘0000000000011‘
elif class_num == ‘2‘:
day_time=‘0000000001100‘
elif class_num == ‘3‘:
day_time=‘0000000001111‘
elif class_num == ‘4‘:
day_time=‘0000001100000‘
elif class_num == ‘5‘:
day_time=‘0000110000000‘
elif class_num == ‘6‘:
day_time=‘0000111100000‘
elif class_num == ‘7‘:
day_time=‘0110000000000‘
param = {‘setAction‘: ‘classroomQuery‘,
‘PageAction‘: ‘Query‘,
‘day_time_text‘: day_time,
‘school_area_code‘: ‘1‘,
‘building‘: ‘‘,
‘week_no‘: str(week),
‘day_no‘: str(int(day_num)),
‘B1‘: ‘查询‘
}
cookie_copy = ‘JSESSIONID=‘+cookie
print(param)
headers = {
‘Connection‘: ‘keep-alive‘,
‘Cache-Control‘: ‘max-age=0‘,
‘Origin‘: ‘http://tiedao.vatuu.com‘,
‘Upgrade-Insecure-Requests‘: ‘1‘,
‘Content-Type‘: ‘application/x-www-form-urlencoded‘,
‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36‘,
‘Accept‘: ‘text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,\
image/apng,*/*;q=0.8‘,
‘Referer‘: ‘http://tiedao.vatuu.com/vatuu/CourseAction‘,
‘Accept-Encoding‘: ‘gzip, deflate‘,
‘Accept-Language‘: ‘zh-CN,zh;q=0.9,en;q=0.8‘,
‘Cookie‘: cookie_copy
}
r = requests.post(url,data=param,headers=headers)
soup = BeautifulSoup(html_utf(r.content), ‘lxml‘)
table = soup.find(‘table‘,attrs={‘class‘:‘table_gray‘})
trs = table.find_all(‘tr‘)
contents = []
for tr in trs:
ths = tr.find_all(‘th‘)
if ths:
for th in ths:
print(th.string.strip(),‘ | ‘,end=‘‘)
print(‘‘)
else:
content = []
tds = tr.find_all(‘td‘)
for td in tds:
res = td.find_all(text=True)
strs = ‘‘
for s in res:
strs+=s
strs = strs.strip()
print(strs,‘ || ‘,end=‘‘)
contents.append(strs)
print(‘+++++++++++++++++++++++++++++‘)
if __name__ == ‘__main__‘:
login = Login()
# login.login()
time.sleep(2)
course_url = ‘http://tiedao.vatuu.com/vatuu/CourseAction?setAction=userCourseScheduleTable\
&viewType=studentQueryCourseList&selectTableType=ThisTerm&queryType=student‘
room_url = ‘http://tiedao.vatuu.com/vatuu/CourseAction‘
# score(login.url,login.cookie)
print(login.cookie)
cookie=‘8CD73FB791382490DA6F32187893B80E‘
course(course_url,cookie)
for i in range(1,100):
room(room_url,cookie)
下一步准备进行遍历这些数据,展示空教室信息。
标签:EDA less chrome random submit header ict Nid stat
原文地址:https://www.cnblogs.com/shumouren/p/13090460.html