码迷,mamicode.com
首页 > 其他好文 > 详细

模块法之内蒙古自治区环境保护厅

时间:2018-04-18 14:24:15      阅读:228      评论:0      收藏:0      [点我收藏+]

标签:web   环境保护   页面   sheet   lxml   main   etl   sse   环境   

import re,requests,xlwt
from lxml import etree
# HTTP request headers sent with every request; the User-Agent string
# identifies the client as a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}
# Accumulates one [name, address, danwei, pingjiajigou, date] row per record
# scraped by getinfos(); written to the spreadsheet by the script entry point.
end_list = []
def getlinks(url):
    """Fetch a listing page and scrape every detail page it links to.

    Downloads ``url``, collects the ``href`` that follows each
    ``class="font_hei15_1"`` marker in the HTML, resolves it against the
    listing directory and passes the resulting absolute URL to ``getinfos``.

    Args:
        url: Address of a listing (index) page.
    """
    # Local import so this function does not depend on edits to the
    # file-level import block.
    from urllib.parse import urljoin

    base_url = 'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/'

    wb_data = requests.get(url, headers=headers)
    # Let requests guess the charset from the body instead of the headers.
    wb_data.encoding = wb_data.apparent_encoding
    links = re.findall(r'class="font_hei15_1".*?href="(.*?)"', wb_data.text, re.S)
    for link in links:
        # urljoin resolves "./x" exactly like the former string
        # concatenation + replace('/.', ''), and additionally handles
        # "../x" and absolute hrefs without corrupting them.
        getinfos(urljoin(base_url, link))
# Row prefixes shared by the table-based detail-page layouts.
_ROW_TABLE = 'div/table/tbody/tr[2]'
_ROW_SPAN_TABLE = 'div/span/table/tbody/tr[2]'
_ROW_DIV_SPAN_TABLE = 'div/div/span/table/tbody/tr[2]'
_ROW_DIV_TABLE = 'div/div/table/tbody/tr[3]'

# Known detail-page layouts, tried in this order. Each entry is
# (field_paths, date_path, date_fragments):
#   field_paths     XPaths (relative to a div.xl_nr_16 container) for
#                   name, address, danwei and pingjiajigou; the first hit
#                   of each is used.
#   date_path       XPath for the date text.
#   date_fragments  how many leading text fragments are concatenated to form
#                   the date; the layout only matches if at least that many
#                   fragments exist.
_INFO_LAYOUTS = (
    # Structure like items 3/5/9/10 on page 1.
    (('p/text()[5]', 'p/text()[6]', 'p/text()[7]', 'p/text()[8]'),
     'p/text()[9]', 1),
    # Structure like items 1/2 on page 1.
    (('div/text()[10]', 'div/text()[12]', 'div/text()[14]', 'div/text()[16]'),
     'div/text()[18]', 1),
    # Structure like items 6/13/14 on page 1.
    (('div/p[6]/text()', 'div/p[7]/text()', 'div/p[8]/text()', 'div/p[9]/text()'),
     'div/p[10]/text()', 1),
    # Structure like items 4/7/8/11/12/15 on page 1.
    (('div/p/text()[10]', 'div/p/text()[12]', 'div/p/text()[14]', 'div/p/text()[16]'),
     'div/p/text()[18]', 1),
    # Structure like items 7/8 on page 2; the date is split across several
    # text fragments that have to be merged.
    ((_ROW_TABLE + '/td[2]/p/a/text()',
      _ROW_TABLE + '/td[3]/p/span/text()',
      _ROW_TABLE + '/td[4]/p/span/text()',
      _ROW_TABLE + '/td[5]/p/span/text()'),
     _ROW_TABLE + '/td[6]/p/span/text()', 6),
    # Structure like items 9/10 on page 2 (name sits directly in the cell).
    ((_ROW_TABLE + '/td[2]/text()',
      _ROW_TABLE + '/td[3]/p/span/text()',
      _ROW_TABLE + '/td[4]/p/span/text()',
      _ROW_TABLE + '/td[5]/p/span/text()'),
     _ROW_TABLE + '/td[6]/p/span/text()', 6),
    # Structure like item 11 on page 2 (table wrapped in a span).
    ((_ROW_SPAN_TABLE + '/td[2]/p/a/text()',
      _ROW_SPAN_TABLE + '/td[3]/p/span/text()',
      _ROW_SPAN_TABLE + '/td[4]/p/span/text()',
      _ROW_SPAN_TABLE + '/td[5]/p/span/text()'),
     _ROW_SPAN_TABLE + '/td[6]/p/span/text()', 6),
    # Variant of the page 2 item 11 structure with the name inside a span.
    ((_ROW_SPAN_TABLE + '/td[2]/p/span/text()',
      _ROW_SPAN_TABLE + '/td[3]/p/span/text()',
      _ROW_SPAN_TABLE + '/td[4]/p/span/text()',
      _ROW_SPAN_TABLE + '/td[5]/p/span/text()'),
     _ROW_SPAN_TABLE + '/td[6]/p/span/text()', 6),
    # Structure like items 13/14 on page 2.
    ((_ROW_DIV_SPAN_TABLE + '/td[2]/p/span/a/text()',
      _ROW_DIV_SPAN_TABLE + '/td[3]/p/span[1]/text()',
      _ROW_DIV_SPAN_TABLE + '/td[4]/p/span/text()',
      _ROW_DIV_SPAN_TABLE + '/td[5]/p/span/text()'),
     _ROW_DIV_SPAN_TABLE + '/td[6]/p/span/text()', 6),
    # Structure like item 15 on page 2.
    ((_ROW_DIV_SPAN_TABLE + '/td[2]/p/a/text()',
      _ROW_DIV_SPAN_TABLE + '/td[3]/p/span[1]/text()',
      _ROW_DIV_SPAN_TABLE + '/td[4]/p/span[1]/text()',
      _ROW_DIV_SPAN_TABLE + '/td[5]/p/span[1]/text()'),
     _ROW_DIV_SPAN_TABLE + '/td[6]/p/span/text()', 6),
    # Plain table whose data sits in the third row, text directly in cells.
    ((_ROW_DIV_TABLE + '/td[2]/text()',
      _ROW_DIV_TABLE + '/td[3]/text()',
      _ROW_DIV_TABLE + '/td[4]/text()',
      _ROW_DIV_TABLE + '/td[5]/text()'),
     _ROW_DIV_TABLE + '/td[6]/text()', 1),
)


def _extract_fields(container):
    """Return [name, address, danwei, pingjiajigou, date] from the first
    layout in ``_INFO_LAYOUTS`` that fully matches, or None if none does."""
    for field_paths, date_path, date_fragments in _INFO_LAYOUTS:
        values = []
        for path in field_paths:
            hits = container.xpath(path)
            if not hits:
                break  # this layout does not apply; try the next one
            values.append(hits[0])
        else:
            fragments = container.xpath(date_path)
            if len(fragments) >= date_fragments:
                values.append(''.join(fragments[:date_fragments]))
                return values
    return None


def getinfos(url):
    """Scrape one detail page and append its record(s) to ``end_list``.

    For every ``div`` with class ``xl_nr_16`` on the page, the known page
    layouts are tried in order and the first complete match yields a
    ``[name, address, danwei, pingjiajigou, date]`` row. When no layout
    matches, the row is ``[url, 'null', 'null', 'null', 'null']`` so the page
    can be inspected by hand. Each row is printed and appended to the
    module-level ``end_list``.

    Every layout is evaluated against the untouched container element, so a
    partial match of one layout can no longer prevent later layouts from
    being tried.

    Args:
        url: Address of the detail page.
    """
    wb_data = requests.get(url, headers=headers)
    # Let requests guess the charset from the body instead of the headers.
    wb_data.encoding = wb_data.apparent_encoding
    soup = etree.HTML(wb_data.text)
    for container in soup.xpath('//div[@class="xl_nr_16"]'):
        row = _extract_fields(container)
        if row is None:
            # Unknown layout: keep the URL in the name column as a marker.
            row = [url, 'null', 'null', 'null', 'null']
        print(*row)
        end_list.append(row)
if __name__ == '__main__':
    # Script entry point: scrape the first listing page plus index_1..index_5,
    # then write the collected rows to an .xls workbook.
    #
    # The workbook is created before the try block so that the except
    # handler can always save it, even when scraping fails before a single
    # cell has been written.
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('getmessage')
    try:
        # This is the first (home) listing page.
        getlinks('http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/index.html')
        for page in range(1, 6):
            getlinks('http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/index_{}.html'.format(page))

        header = ['项目名称', '建设地点', '建设单位', '环境影响机构', '受理日期']
        for col, title in enumerate(header):
            sheet.write(0, col, title)
        # The last three collected rows are deliberately left out (the
        # completion message below says so); data rows start at row 1.
        for row_idx, record in enumerate(end_list[:-3], start=1):
            for col, message in enumerate(record):
                sheet.write(row_idx, col, message)
        print('写入完毕,最后三项已剔除')
        book.save('endresult.xls')
    except Exception as exc:
        print('wrong')
        # Show what actually failed instead of hiding it.
        print(repr(exc))
        book.save('error.xls')

 

模块法之内蒙古自治区环境保护厅

标签:web   环境保护   页面   sheet   lxml   main   etl   sse   环境   

原文地址:https://www.cnblogs.com/mayunji/p/8874283.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!