人行征信第三张报告的信息提取

时间：2017-12-29 15:05:20 阅读：245 评论：0 收藏：0 [点我收藏+]

标签：red amp pat tree count use order reason veh

提取第三张报告的各种信息。使用正则和xpath方法。

# coding=utf8

import re,json,os
from lxml import etree
from collections import OrderedDict
from common import html,LoggerUntil,handle_parse_exception
from html_processor import  HtmlProcessor

# Module-level logger shared by this file. LoggerUntil comes from the
# project-local `common` module; the meaning of loglevel=2 and
# add_StreamHandler=1 is defined there and is not visible here.
# The quotes around the log file name were mangled into typographic quotes
# in the pasted source, which is a syntax error; they are plain ASCII now.
logger = LoggerUntil(name="crcc_paser").getlog(logfilename='crcc_paser.log',loglevel=2,add_StreamHandler=1)
class SaveHtmlFileMixin(object):
    """Mixin that writes the host object's raw HTML to ``htmldir/<name>.html``.

    The host class must provide two attributes:

    * ``self.html`` -- the payload to write; the file is opened in binary
      mode, so this has to be a bytes-like object.
    * ``self.name`` -- a string used as the file stem.
    """

    def save_to_file(self):
        """Write ``self.html`` to the path returned by :meth:`create_file`."""
        # The original opened the file and never closed it, leaking the
        # handle and leaving the flush to garbage collection. The context
        # manager closes it even when the write raises.
        with open(self.create_file(), 'wb') as html_file:
            html_file.write(self.html)

    def create_file(self):
        """Return the output path, creating the ``htmldir`` directory if needed.

        The directory is relative to the current working directory. Despite
        its name, this method does not create the file itself.
        """
        if not os.path.exists('htmldir'):
            os.mkdir('htmldir')
        html_file_name = 'htmldir' + '/' + self.name + '.html'
        return html_file_name


class CrccPaser(SaveHtmlFileMixin):
    """Extract structured fields from the HTML source of a credit report page.

    The constructor decodes the raw page and builds an lxml selector; each
    ``extract_*`` method then fills part of ``self.data`` (an ``OrderedDict``
    that is later serialised to JSON). ``extract_all`` runs every extractor
    in order.

    Regex fragments that contain backslashes are written as raw ASCII
    literals, and the Chinese labels live in separate ``u''`` literals. This
    avoids invalid-escape warnings and keeps the resulting pattern text
    exactly the same as the hand-written originals.
    """

    # (data key, label preceding the value) for the report header fields.
    _USER_INFO_FIELDS = (
        ('report_no', u'报告编号：'),
        ('query_time', u'查询时间：'),
        ('report_time', u'报告时间：'),
        ('crcc_name', u'姓名：'),
        ('id_type', u'证件类型：'),
        ('id_no', u'证件号码：'),
    )

    # One summary-table row: a left-aligned label cell followed by three
    # centred value cells. ``%s`` is replaced with the row label.
    _SUMMARY_ROW_TEMPLATE = (
        r'<tr>\s*?<td align="left" class="p">\s*?%s\s*?</td>'
        r'\s*?<td align="center" class="p">\s*(.*?)\s*?</td>'
        r'\s*?<td align="center" class="p">\s*(.*?)\s*?</td>'
        r'\s*?<td align="center" class="p">\s*(.*?)\s*?</td>'
        r'\s*?</tr>'
    )

    # (data key, row label exactly as it appears in the page source).
    # None of the labels contain regex metacharacters.
    _SUMMARY_ROWS = (
        ('account_num', u'&nbsp;账户数'),
        ('uncleared_num', u'&nbsp;&nbsp;&nbsp;未结清/未销户账户数'),
        ('overdue_num', u'&nbsp;发生过逾期的账户数'),
        ('overdue90_num', u'&nbsp;&nbsp;&nbsp;发生过90天以上逾期的账户数'),
        ('assure_num', u'&nbsp;为他人担保笔数'),
    )

    # One numbered <li> entry of the loan detail list.
    _LOAN_ITEM_PATTERN = (
        r'<li\s*?style="list-style-type: decimal; list-style-position: outside">'
        r'\s*?(\S*?)\s*?</li>'
    )

    # One row of the query-record table: number, date (ends with the
    # character for "day"), querying party, reason.
    _QUERY_ROW_PATTERN = (
        r'<tr align="center">[\s\S]*?td class="p">\s*(.*?)\s*?</td>'
        r'[\s\S]*?<td class="p">\s*(.*?' + u'日' + r')\s*?</td>'
        r'[\s\S]*?<td class="p">\s*(.*?)\s*?</td>'
        r'[\s\S]*?<td class="p">\s*(.*?)\s*?</td>'
        r'[\s\S]*?</tr>'
    )

    # Sentences the page shows when a section has no records.
    _NO_PUBLIC_RECORDS = u'系统中没有您最近5年内的欠税记录、民事判决记录、强制执行记录、行政处罚记录及电信欠费记录。'
    _NO_QUERY_RECORDS = u'系统中没有您的信用报告最近2年被查询的记录。'

    def __init__(self,html,name):
        """Store the raw page, decode it and build the lxml selector.

        :param html: UTF-8 encoded page source (``decode`` is called on it).
        :param name: label stored under ``data['name']``; also used by the
            mixin as the file stem.
        """
        self.html = html
        self.name = name
        self.data = OrderedDict()
        self.data['name'] = name
        self.selector = None
        self.text = self._get_text()
        self.get_selector()

    def _get_text(self):
        """Return ``self.html`` decoded as UTF-8."""
        return self.html.decode('utf8')

    def get_selector(self):
        """Parse ``self.text`` with ``etree.HTML`` and keep the tree on ``self.selector``."""
        self.selector = etree.HTML(self.text)

    def _extract_labelled_value(self, label):
        """Return the stripped text between ``label`` and the next ``</strong>``.

        :raises AttributeError: if the label is not present in the page (the
            failed ``re.search`` returns ``None``), as in the original code.
        """
        return re.search(label + u'(.*?)</strong>', self.text).group(1).strip()

    def extract_user_info(self):
        """Fill the header fields (report number, timestamps, identity)."""
        for key, label in self._USER_INFO_FIELDS:
            self.data[key] = self._extract_labelled_value(label)

    def extract_summary_information(self):
        """Fill the five summary rows, each as a dict built by ``_init_num_dict``.

        :raises AttributeError: if a row is missing from the page.
        """
        for key, label in self._SUMMARY_ROWS:
            row_values = re.search(self._SUMMARY_ROW_TEMPLATE % label, self.text).groups()
            self.data[key] = self._init_num_dict(row_values)

    @staticmethod
    def _init_num_dict(num_tuple):
        """Map the three captured cell values of a summary row to named keys.

        :param num_tuple: exactly three values, in page column order.
        :raises ValueError: if ``num_tuple`` does not hold exactly three items.
        """
        num_dict = {}
        # NOTE(review): the ' home_loans' key has a leading space, presumably
        # a typo. It is kept unchanged because it is part of the emitted
        # JSON and existing consumers may already depend on it.
        num_dict['credit_card'],num_dict[' home_loans'],num_dict['other_loans'] = num_tuple
        return num_dict

    def extract_all_loan_information(self):
        """Store the list of loan detail entries (empty if none match)."""
        self.data['all_loan_information'] = re.findall(self._LOAN_ITEM_PATTERN, self.text)

    def extract_public_records(self):
        """Store the public-records section as a list of strings.

        When the page carries the "no records" sentence, the list holds that
        sentence. Otherwise it holds the text of the first matched table
        cell, or is empty when the XPath matches nothing.
        """
        if self._NO_PUBLIC_RECORDS in self.text:
            public_records = [self._NO_PUBLIC_RECORDS]
        else:
            # TODO: the layout of a report that actually has public records is
            # not confirmed, so this XPath (including its tbody step) may not
            # match real pages.
            cells = self.selector.xpath('//table[@align="center"]//table[5]/tbody/tr[3]/td')
            # The original called .strip() on the lxml Element itself, which
            # raises AttributeError, and indexed [0] without checking for a
            # match. Take the cell's text instead and tolerate no match, so
            # the remaining extractors still run.
            if cells:
                public_records = [u''.join(cells[0].itertext()).strip()]
            else:
                public_records = []
        self.data['public_records'] = public_records

    def extract_query_records(self):
        """Store the query history as a list.

        The list holds the "no records" sentence when the page shows it,
        otherwise one ``OrderedDict`` per matched table row.
        """
        if self._NO_QUERY_RECORDS in self.text:
            query_records = [self._NO_QUERY_RECORDS]
        else:
            matched_rows = re.findall(self._QUERY_ROW_PATTERN, self.text)
            query_records = self._init_query_records(matched_rows)
        self.data['query_records'] = query_records

    @staticmethod
    def _init_query_records(query_records):
        """Convert 4-tuples of captured cells into ordered dicts.

        :type query_records: list
        :param query_records: tuples of (number, date, querying party, reason).
        :return: list of ``OrderedDict`` with keys ``no``, ``query_date``,
            ``query_person`` and ``query_reason``, in that order.
        """
        query_records_list = []
        for record_tuple in query_records:
            number, query_date, query_person, query_reason = record_tuple
            query_record_dict = OrderedDict()
            query_record_dict['no'] = number
            query_record_dict['query_date'] = query_date
            query_record_dict['query_person'] = query_person
            query_record_dict['query_reason'] = query_reason
            query_records_list.append(query_record_dict)
        return query_records_list

    @handle_parse_exception
    def extract_all(self):
        """Run every extractor in order, filling ``self.data``.

        The method is wrapped by ``handle_parse_exception`` from the
        project-local ``common`` module; how that decorator treats errors is
        not visible in this file.
        """
        self.extract_user_info()
        self.extract_summary_information()
        self.extract_all_loan_information()
        self.extract_public_records()
        self.extract_query_records()



def extract_crcc(html_str,name):
    """Parse one credit report page and return the extracted data as JSON.

    :param html_str: raw page source, passed to both ``HtmlProcessor`` and
        ``CrccPaser``.
    :param name: label for this report, passed to both classes.
    :return: ``CrccPaser.data`` serialised with ``ensure_ascii=False``.
    """
    # Save the HTML file before parsing.
    HtmlProcessor(html_str, name).save_to_file()

    parser = CrccPaser(html_str, name)
    parser.extract_all()

    # Serialise once and reuse the string for the log line and the result.
    serialized = json.dumps(parser.data, ensure_ascii=False)
    logger.info(serialized)  # TODO (carried over from the original; no detail given)
    return serialized

if __name__ == '__main__':
    # Demo run: `html` is the sample page source imported from the
    # project-local `common` module. The quotes on these two lines were
    # mangled into typographic quotes in the pasted source (a syntax
    # error); they are plain ASCII quotes now.
    extract_crcc(html,'小明5')

其中 html 是第三张报告的页面源码字符串。

结果是

{"name": "小明5", "report_no": "2017122200004891965680", "query_time": "2017.12.22 11:12:32", "report_time": "2017.12.22 18:38:18", "crcc_name": "小明5", "id_type": "身份证", "id_no": "**************4337", "account_num": {" home_loans": "0", "other_loans": "2", "credit_card": "0"}, "uncleared_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "overdue_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "overdue90_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "assure_num": {" home_loans": "0", "other_loans": "0", "credit_card": "0"}, "all_loan_information": ["2010年8月23日国家开发银行湖北省分行发放的6,000元（人民币）个人助学贷款，2014年10月已结清。", "2009年11月19日国家开发银行湖北省分行发放的6,000元（人民币）个人助学贷款，2014年10月已结清。"], "public_records": ["系统中没有您最近5年内的欠税记录、民事判决记录、强制执行记录、行政处罚记录及电信欠费记录。"], "query_records": [{"no": "1", "query_date": "2017年12月4日", "query_person": "本人", "query_reason": "本人查询（互联网个人信用信息服务平台）"}, {"no": "2", "query_date": "2017年11月20日", "query_person": "本人", "query_reason": "本人查询（互联网个人信用信息服务平台）"}, {"no": "3", "query_date": "2017年11月6日", "query_person": "本人", "query_reason": "本人查询（互联网个人信用信息服务平台）"}, {"no": "4", "query_date": "2017年10月20日", "query_person": "本人", "query_reason": "本人查询（互联网个人信用信息服务平台）"}, {"no": "5", "query_date": "2017年10月10日", "query_person": "本人", "query_reason": "本人查询（互联网个人信用信息服务平台）"}, {"no": "6", "query_date": "2017年9月27日", "query_person": "本人", "query_reason": "本人查询（互联网个人信用信息服务平台）"}, {"no": "7", "query_date": "2017年9月18日", "query_person": "本人", "query_reason": "本人查询（互联网个人信用信息服务平台）"}]}

人行征信第三张报告的信息提取

标签：red amp pat tree count use order reason veh

原文地址：https://www.cnblogs.com/ydf0509/p/8143857.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行