码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫案例-爬运维工单

时间:2020-02-24 20:14:37      阅读:105      评论:0      收藏:0      [点我收藏+]

标签:nec   tom   none   date   gic   pos   get   inter   ==   

 

源代码:

# coding=utf-8
import requests
from lxml import etree


class ChaxunSpdier:
    """Crawl the complaint work-order query of the EOMS web application.

    The spider POSTs a fixed query form to the work-order search endpoint,
    walks the paginated result list, requests the detail page of every
    listed work order and finally prints everything it collected.
    """

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the whole crawl.
    request_timeout = 30

    def __init__(self):
        """Set up the entry URL, the base URL for relative links and the headers."""
        self.start_url = "http://111.40.232.237:9000/eoms35/sheet/complaint/complaint.do?method=performQuery"
        self.part_url = "http://111.40.232.237:9000/eoms35/sheet/complaint/"
        self.headers = {
            "Connection": "keep-alive",
            # The session cookie has to be replaced before every run.
            "Cookie": "TSJSESSIONID=0000YvxNFfPYx8EBo8lsKNrKIl6:1bkt8lo7d",
            "Host": "111.40.232.237:9000",
            "Referer": "http://111.40.232.237:9000/eoms35/sheet/complaint/complaint.do?method=showQueryPage&type=interface&urlType=complaint&userName=liuhaoce&workSerial=0&isDutyMaster=false&workSerialTime=&startDuty=&endDuty=",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36",
        }

    def parse_url(self, url):
        """POST the fixed query form to *url* and return the raw response body.

        Args:
            url: Address to send the query form to.

        Returns:
            The response content as bytes.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        formdata = {
            "sheetIdStringExpression": "like",
            "main.sheetId": "",  # work-order serial number
            "titleStringExpression": "like",
            "main.title": "",
            "main.status": "",
            "statusChoiceExpression": "0",
            "task.taskName": "",
            "sendRoleIdStringExpression": "in",
            "main.sendRoleId": "",
            "sendDeptIdStringExpression": "in",
            "main.sendDeptId": "",
            "sendUserIdStringExpression": "in",
            "main.sendUserId": "",
            "operateRoleIdStringExpression": "in",
            "link.operateRoleId": "",
            "operateDeptIdStringExpression": "in",
            "link.operateDeptId": "",
            "operateUserIdStringExpression": "in",
            "link.operateUserId": "",
            "toDeptIdStringExpression": "in",
            # NOTE(review): the two comma-separated values below are kept
            # exactly as published (with a space after the comma) - confirm
            # against the form the browser actually submits.
            "showArea": "大庆, 铁通",  # province that accepted the complaint
            "main.toDeptId": "1005, 1021",
            "main.complaintType1": "",
            "complaintType1ChoiceExpression": "1010615100202",  # complaint type 1: home broadband service
            "main.complaintType2": "",
            "complaintType2ChoiceExpression": "",
            "main.complaintType": "",
            "main.complaintType4": "",
            "main.complaintType5": "",
            "main.complaintType6": "",
            "main.complaintType7": "",
            "complaintNumStringExpression": "",
            "main.complaintNum": "",
            "parentCorrelationStringExpression": "",
            "main.parentCorrelation": "",
            "customAttributionStringExpression": "like",
            "main.customAttribution": "",
            "repeatComplaintTimesStringExpression": ">=",
            "main.repeatComplaintTimes": "",
            "complaintDescStringExpression": "like",
            "main.complaintDesc": "",
            "main.sendTime": "",
            "sendTimeStartDateExpression": ">=",
            "sendTimeStartDate": "2020-02-02 20:13:35",  # start of the time window
            "sendTimeLogicExpression": "and",
            "sendTimeEndDateExpression": "<=",
            "sendTimeEndDate": "2020-02-23 20:13:35",  # end of the time window
            "queryType": "record",
        }
        response = requests.post(
            url,
            data=formdata,
            headers=self.headers,
            timeout=self.request_timeout,
        )
        return response.content

    def get_content_list(self, html_raw):
        """Extract the work orders of one result page plus the next-page URL.

        Args:
            html_raw: Raw HTML of a result-list page.

        Returns:
            A tuple ``(content_list, next_gongdan_url)``: a list with one dict
            per table row, and the absolute URL of the next result page or
            ``None`` when the page has no "下一页" (next page) link.

        Raises:
            IndexError: If a row lacks the expected cells or link.
        """
        html = etree.HTML(html_raw)
        tr_list = html.xpath("//tbody/tr")  # every tr holds one complaint row
        content_list = []
        for content in tr_list:
            item = {}
            zineirong = content.xpath("./td")  # the cells of this complaint row
            item["工单主题"] = zineirong[0].xpath(".//text()")[0]
            item["工单流水号"] = zineirong[1].xpath("./a/text()")[0]
            # The handling deadline in cell 3 ('处理时限') is not extracted.
            detail_link = self.part_url + zineirong[1].xpath("./a/@href")[0]
            item["xiangqing"] = self.get_gongdan_detail(detail_link)
            content_list.append(item)

        # Evaluate the pagination link once instead of twice.
        next_links = html.xpath("//a[text()='下一页']/@href")
        next_gongdan_url = self.part_url + next_links[0] if next_links else None
        return content_list, next_gongdan_url

    def get_gongdan_detail(self, url):
        """Fetch one work-order detail page and pull the fields of interest.

        The page is requested through ``parse_url``, i.e. with the same POST
        form that is used for the list query.

        Args:
            url: Absolute URL of the detail page.

        Returns:
            A dict whose values are the (possibly empty) lists of text nodes
            matched by each XPath expression.
        """
        html_raw = self.parse_url(url)
        html = etree.HTML(html_raw)
        xiangqing_dict = {}
        xiangqing_dict["投诉内容"] = html.xpath('//*[@id="complainttext"]/text()')
        # The target lives inside an iframe, so this lookup finds nothing.
        xiangqing_dict["派往对象"] = html.xpath(
            '//div[@id="ext-gen47"]/table/tbody/tr[4]/td[4]/text()'
        )
        xiangqing_dict["qita"] = html.xpath('//*[@id="ext-gen47"]/text()')

        return xiangqing_dict

    def save_content_list(self, content_list):
        """Print every collected work order with a 1-based running number."""
        for i, v in enumerate(content_list, start=1):
            print(i, v)

    def run(self):
        """Crawl every result page from the start URL and print the results."""
        next_url = self.start_url  # main work-order query page
        content_total_list = []
        while next_url is not None:
            html_raw = self.parse_url(next_url)  # raw HTML of the current result page
            # Extract this page's rows and the link to the following page.
            content_list, next_url = self.get_content_list(html_raw)
            content_total_list.extend(content_list)
        self.save_content_list(content_total_list)  # print every work order

# Run the crawl only when the file is executed as a script.
if __name__ == "__main__":
    Spdier = ChaxunSpdier()
    Spdier.run()

 

爬虫案例-爬运维工单

标签:nec   tom   none   date   gic   pos   get   inter   ==   

原文地址:https://www.cnblogs.com/iamorz/p/12358379.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!