# -*- coding: utf-8 -*-
"""Log in to a web form, run a date-range search and save the result as .xls.

The flow is:

1. install a cookie-aware URL opener;
2. GET the login page and read its hidden form fields
   (``__EVENTVALIDATION``, ``__VIEWSTATEGENERATOR``, ``__VIEWSTATE``);
3. POST the credentials together with those fields;
4. GET the search page, re-read the hidden fields and POST the search form;
5. split every response line on tabs and write the cells to an Excel sheet.

``url``, ``posturl`` and the credentials in ``main`` are placeholders and
must be filled in before the script is run.
"""
import gzip
import http.cookiejar
import os
import re
import sys
import time
import urllib.parse
import urllib.request

try:
    import xlwt
except ImportError:
    # Deferred failure: keeps the pure helpers importable without xlwt.
    # main() and the workbook helper raise a clear error before using it.
    xlwt = None


# Headers sent with the two POST requests (login and search).
header = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; '
                   'rv:11.0) like Gecko'),
    'Referer': '',
}

# Login page: fetched once for its hidden fields, then POSTed to.
url = ''
# Search page: fetched once for its hidden fields, then POSTed to.
posturl = ''

# Hidden form fields that are read from each page and echoed back on POST.
_STATE_FIELDS = ('__EVENTVALIDATION', '__VIEWSTATEGENERATOR', '__VIEWSTATE')


def _require_xlwt():
    """Raise ImportError with a clear message when xlwt is not installed."""
    if xlwt is None:
        raise ImportError('xlwt is required to write .xls files')


def _build_workbook(rows, coding):
    """Return an xlwt workbook whose 'Sheet1' holds one cell per value.

    ``rows`` is an iterable of iterables; the outer index becomes the row
    number and the inner index the column number.
    """
    _require_xlwt()
    workbook = xlwt.Workbook(encoding=coding)
    sheet = workbook.add_sheet('Sheet1')
    for row, rowdata in enumerate(rows):
        for col, val in enumerate(rowdata):
            sheet.write(row, col, val, style=xlwt.Style.default_style)
    return workbook


def saveexcel(flow, filename, coding='gbk'):
    """Write a nested list to ``\\<filename>.xls`` and return that path.

    Args:
        flow: the data to convert, as a two-level list (rows of cells).
            String cells are stripped of surrounding whitespace; other
            values are written unchanged.
        filename: base name of the workbook, without extension.
        coding: encoding passed to ``xlwt.Workbook``.

    Returns:
        The path the workbook was saved to, or ``None`` when anything went
        wrong (the error is printed).
    """
    try:
        rows = [
            [val.strip() if isinstance(val, str) else val for val in rowdata]
            for rowdata in flow
        ]
        workbook = _build_workbook(rows, coding)
        # NOTE(review): the leading backslash makes this a path relative to
        # the root of the current drive on Windows - confirm that is intended.
        excelname = '\\%s.xls' % filename
        workbook.save(excelname)
        return excelname
    except Exception as e:
        # Always report the failure; most exceptions carry neither a
        # ``code`` nor a ``reason`` attribute, so checking for those alone
        # would swallow the error silently.
        print('excel写入失败,错误原因' + str(e))
        return None


def getParm(data, parm):
    """Extract the ``value`` attribute of the form field named ``parm``.

    Args:
        data: the page markup as text.
        parm: the exact ``name`` attribute of the field to look up.

    Returns:
        The value of the first matching field, or ``None`` if there is no
        tag that has ``name="<parm>"`` followed by a ``value="..."``.
    """
    # ``[^>]*?`` keeps the match inside the tag that carries the name, so a
    # later field on the same line can never supply the value; the name is
    # escaped because field names may contain regex metacharacters.
    pattern = r'name="' + re.escape(parm) + r'"[^>]*?\svalue="([^"]*)"'
    match = re.search(pattern, data)
    return match.group(1) if match else None


def getOpener():
    """Build a cookie-handling opener, install it globally and return it.

    After this call ``urllib.request.urlopen`` goes through the returned
    opener, so cookies set by one response are sent with later requests.
    """
    print('正在设置cookie')
    cj = http.cookiejar.CookieJar()
    pro = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(pro, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    print('设置cookie成功')
    return opener


def _report_url_error(message, error):
    """Print ``message`` followed by the error's code and/or reason."""
    if hasattr(error, 'code'):
        print(message + str(error.code))
    if hasattr(error, 'reason'):
        print(message + str(error.reason))


def _fetch(request, failure_message, as_lines=False):
    """Open ``request`` and return its body; exit the script on URLError.

    Args:
        request: the ``urllib.request.Request`` to open.
        failure_message: prefix printed in front of the error details.
        as_lines: return ``readlines()`` (list of bytes) instead of
            ``read()`` (bytes).
    """
    try:
        with urllib.request.urlopen(request) as response:
            return response.readlines() if as_lines else response.read()
    except urllib.request.URLError as e:
        _report_url_error(failure_message, e)
        # Nothing downstream can work without this response, so stop here
        # instead of failing later on a missing variable.
        sys.exit(1)


def _read_state_fields(page):
    """Return the hidden state fields of ``page`` as a name -> value dict."""
    return {name: getParm(page, name) for name in _STATE_FIELDS}


def main(start_date='2016-01-01', end_date='2016-01-31'):
    """Run the login, search and Excel export steps in order.

    Args:
        start_date: value posted as ``txtStartDate``.
        end_date: value posted as ``txtEndDate``.
    """
    # Fail before any network traffic if the export cannot work anyway.
    _require_xlwt()

    # 1. Set up cookies.
    getOpener()

    # 2. Fetch the login page and read its form parameters.
    page = _fetch(urllib.request.Request(url),
                  '请求页面失败,请检查网络设置,错误原因').decode('gbk')

    login_id = '***'
    password = '***'
    postDict = {'LoginID': login_id, 'Pwd': password}
    postDict.update(_read_state_fields(page))
    # NOTE(review): the value is looked up as 'sbtnSubmit' but posted as
    # 'btnSubmit' - confirm the field name against the login page.
    postDict['btnSubmit'] = getParm(page, 'sbtnSubmit')
    postData = urllib.parse.urlencode(postDict).encode(encoding='UTF8')

    # 3. Log in.
    data = _fetch(urllib.request.Request(url, postData, headers=header),
                  '页面加载失败,请检查网络及账号设置,错误原因')
    print('login:', data.decode('gbk'))

    # 4. Open the product search page and extract the data.
    page = _fetch(urllib.request.Request(posturl),
                  '请求页面失败,请检查网络设置,错误原因').decode('gbk')

    # The login fields stay in the dict, so they are posted again with the
    # search form, exactly as before.
    postDict['__EVENTTARGET'] = ''
    postDict['__EVENTARGUMENT'] = ''
    postDict.update(_read_state_fields(page))
    postDict['PName'] = ''
    postDict['PID'] = ''
    postDict['txtStartDate'] = start_date
    postDict['txtEndDate'] = end_date
    # presumably the click coordinates of an image button - verify
    postDict['ConvertToExcel.x'] = '6'
    postDict['ConvertToExcel.y'] = '9'
    postDict['btnSearch'] = ''
    postData = urllib.parse.urlencode(postDict).encode(encoding='UTF8')

    print('搜索页面数据获取成功,正在抓取流向数据...')

    # Submit the search form.
    data = _fetch(urllib.request.Request(posturl, postData, headers=header),
                  '页面加载失败,请检查网络及账号设置,错误原因',
                  as_lines=True)
    print('流向抓取成功,正在保存为excel...')
    print('search:', data)

    # 5. Save as Excel: one sheet row per response line, one cell per
    # tab-separated field.
    rows = [line.decode('gbk').split('\t') for line in data]
    workbook = _build_workbook(rows, 'gbk')

    ntime = time.strftime('%Y%m%d%H%M%S')
    excelname = ntime + '%s.xls' % '宁波宝瑞达'
    workbook.save(excelname)
    print('excel导出成功,请查看程序目录下%s文件。' % excelname)


if __name__ == '__main__':
    main()