The script below crawls the Python job listings on 51job, pulls the title, company, location, salary, and posting date out of each result row with a regular expression, and writes them into an Excel file with xlwt.

'''
@author: zl
@contact:
@site: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
'''
# _*_ coding:utf-8 _*_
import requests
import re
import xlwt

headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'accept-encoding': "gzip, deflate, br",
    'accept-language': "zh-CN,zh;q=0.9",
    'cache-control': "max-age=0",
    'upgrade-insecure-requests': "1",
    'Connection': 'keep-alive',
    'Host': "search.51job.com",
}

# Fetch the raw HTML of one search-result page
def get_content(page):
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,' + str(page) + '.html'
    req = requests.get(url, headers=headers)
    html = req.content.decode('gbk')  # 51job serves GBK-encoded pages
    return html

# Pull the five fields (title, company, location, salary, date) out of the page
def get(html):
    reg = re.compile(
        r'<p class="t1 ">.*?<a target="_blank" title="(.*?)" '
        r'.*?<span class="t2"><a target="_blank" title="(.*?)" '
        r'.*?<span class="t3">(.*?)</span>'
        r'.*?<span class="t4">(.*?)</span>'
        r'.*?<span class="t5">(.*?)</span>', re.S)
    items = re.findall(reg, html)
    return items

# Write the scraped rows into the Excel sheet, starting at row `index`
def excel_write(items, index):
    for item in items:
        for i in range(0, 5):            # five fields per posting
            print(item[i])
            ws.write(index, i, item[i])  # row, column, value
        index += 1

if __name__ == '__main__':
    newTable = 'test.xls'                 # output file name
    wb = xlwt.Workbook(encoding='utf-8')  # create the workbook with an explicit encoding
    ws = wb.add_sheet('sheet1')           # add a worksheet
    headData = ['Job title', 'Company', 'Location', 'Salary', 'Date']  # header row
    for colnum in range(0, 5):
        ws.write(0, colnum, headData[colnum], xlwt.easyxf('font: bold on'))
    # Crawl pages 1-9 and dump everything into the file
    for each in range(1, 10):
        index = (each - 1) * 50 + 1       # 51job lists 50 postings per page
        excel_write(get(get_content(each)), index)
    wb.save(newTable)
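The original version also imports BeautifulSoup and pymongo's MongoClient without using either. Below is a minimal sketch of how the same extraction could be done with BeautifulSoup instead of one long regular expression. The div.el row container is an assumption about the 2018 page layout (the regex only pins down the t1-t5 class names), so treat the selectors as illustrative rather than a drop-in replacement.

# A hedged sketch of the regex extraction redone with BeautifulSoup.
# Assumption: each posting sits in a <div class="el"> row, as in the
# 2018 51job layout; only the t1-t5 class names come from the script above.
from bs4 import BeautifulSoup

def get_bs4(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = []
    for row in soup.select('div.el'):
        title = row.select_one('p.t1 a')
        company = row.select_one('span.t2 a')
        place = row.select_one('span.t3')
        salary = row.select_one('span.t4')
        date = row.select_one('span.t5')
        if not (title and company):
            continue  # skip the header row and any malformed rows
        items.append((title.get('title', '').strip(),
                      company.get('title', '').strip(),
                      place.get_text(strip=True) if place else '',
                      salary.get_text(strip=True) if salary else '',
                      date.get_text(strip=True) if date else ''))
    return items

Likewise, a minimal sketch of persisting the rows to MongoDB instead of Excel, assuming a local mongod on the default port; the database and collection names (job51, python_jobs) are made up for illustration.

# A hedged sketch of storing the scraped tuples in MongoDB.
# Assumption: mongod is running locally on the default port 27017;
# the job51/python_jobs names are hypothetical.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
collection = client['job51']['python_jobs']

def mongo_write(items):
    keys = ('title', 'company', 'place', 'salary', 'date')
    docs = [dict(zip(keys, item)) for item in items]  # tuple -> document
    if docs:
        collection.insert_many(docs)

Either function slots into the main loop in place of the Excel path, e.g. mongo_write(get(get_content(each))).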
Original post: https://www.cnblogs.com/zhanglin123/p/9203132.html