标签:web 环境保护 页面 sheet lxml main etl sse 环境
"""Scrape project records from the nmgepb.gov.cn listing into an .xls file.

The script walks the listing pages under ``LISTING_URL``, follows every
detail link found on them, extracts five text fields from each detail page
(the spreadsheet columns are 项目名称 / 建设地点 / 建设单位 / 环境影响机构 /
受理日期) and writes the collected rows to ``endresult.xls``.
"""
import re
import traceback
from collections import namedtuple

import requests
import xlwt
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}

# One [name, address, danwei, pingjiajigou, date] list per scraped record.
end_list = []

LISTING_URL = 'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/'

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 30

# Captures the href that follows each class="font_hei15_1" marker on a
# listing page.
_DETAIL_LINK_RE = re.compile(r'class="font_hei15_1".*?href="(.*?)"', re.S)

# Table-style pages split the date over several text nodes; the first six
# are concatenated to form the date.
_DATE_FRAGMENTS = 6

# XPath expressions (relative to the content <div>) describing one known
# page structure.  ``date_fragments`` is how many text nodes at
# ``date_path`` are required and joined to build the date.
_Layout = namedtuple('_Layout', 'field_paths date_path date_fragments')


def _layout(root, name, address, danwei, pingjiajigou, date,
            date_fragments=1):
    """Build a ``_Layout`` whose expressions all share the prefix *root*."""
    tails = (name, address, danwei, pingjiajigou)
    return _Layout(
        field_paths=tuple(root + tail for tail in tails),
        date_path=root + date,
        date_fragments=date_fragments,
    )


_SPAN = 'p/span/text()'
_SPAN1 = 'p/span[1]/text()'

# Known detail-page structures, tried in this order; the first one that
# yields all five fields wins.
_LAYOUTS = (
    # Structure of items 3/5/9/10 on page 1.
    _layout('p/', 'text()[5]', 'text()[6]', 'text()[7]', 'text()[8]',
            'text()[9]'),
    # Structure of items 1/2 on page 1.
    _layout('div/', 'text()[10]', 'text()[12]', 'text()[14]', 'text()[16]',
            'text()[18]'),
    # Structure of items 6/13/14 on page 1.
    _layout('div/', 'p[6]/text()', 'p[7]/text()', 'p[8]/text()',
            'p[9]/text()', 'p[10]/text()'),
    # Structure of items 4/7/8/11/12/15 on page 1.
    _layout('div/p/', 'text()[10]', 'text()[12]', 'text()[14]', 'text()[16]',
            'text()[18]'),
    # Structure of items 7/8 on page 2.
    _layout('div/table/tbody/tr[2]/',
            'td[2]/p/a/text()', 'td[3]/' + _SPAN, 'td[4]/' + _SPAN,
            'td[5]/' + _SPAN, 'td[6]/' + _SPAN, _DATE_FRAGMENTS),
    # Structure of items 9/10 on page 2.
    _layout('div/table/tbody/tr[2]/',
            'td[2]/text()', 'td[3]/' + _SPAN, 'td[4]/' + _SPAN,
            'td[5]/' + _SPAN, 'td[6]/' + _SPAN, _DATE_FRAGMENTS),
    # Structure of item 11 on page 2 (name inside a link).
    _layout('div/span/table/tbody/tr[2]/',
            'td[2]/p/a/text()', 'td[3]/' + _SPAN, 'td[4]/' + _SPAN,
            'td[5]/' + _SPAN, 'td[6]/' + _SPAN, _DATE_FRAGMENTS),
    # Structure of item 11 on page 2 (name inside a span).
    _layout('div/span/table/tbody/tr[2]/',
            'td[2]/' + _SPAN, 'td[3]/' + _SPAN, 'td[4]/' + _SPAN,
            'td[5]/' + _SPAN, 'td[6]/' + _SPAN, _DATE_FRAGMENTS),
    # Structure of items 13/14 on page 2.
    _layout('div/div/span/table/tbody/tr[2]/',
            'td[2]/p/span/a/text()', 'td[3]/' + _SPAN1, 'td[4]/' + _SPAN,
            'td[5]/' + _SPAN, 'td[6]/' + _SPAN, _DATE_FRAGMENTS),
    # Structure of item 15 on page 2.
    _layout('div/div/span/table/tbody/tr[2]/',
            'td[2]/p/a/text()', 'td[3]/' + _SPAN1, 'td[4]/' + _SPAN1,
            'td[5]/' + _SPAN1, 'td[6]/' + _SPAN, _DATE_FRAGMENTS),
    # Plain table cells in the third row.
    _layout('div/div/table/tbody/tr[3]/',
            'td[2]/text()', 'td[3]/text()', 'td[4]/text()', 'td[5]/text()',
            'td[6]/text()'),
)


def _extract_with_layout(container, layout):
    """Return the five fields read through *layout*, or None on any miss.

    *container* is never modified, so a layout that matches only some of
    the fields cannot interfere with the layouts tried after it.
    """
    record = []
    for expression in layout.field_paths:
        matches = container.xpath(expression)
        if not matches:
            return None
        record.append(matches[0])

    date_parts = container.xpath(layout.date_path)
    if len(date_parts) < layout.date_fragments:
        return None
    record.append(''.join(date_parts[:layout.date_fragments]))
    return record


def getlinks(url):
    """Fetch the listing page *url* and scrape every detail page it links to.

    Each href captured by ``_DETAIL_LINK_RE`` is turned into an absolute
    URL and passed to ``getinfos``.
    """
    wb_data = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    wb_data.encoding = wb_data.apparent_encoding
    for link in _DETAIL_LINK_RE.findall(wb_data.text):
        # NOTE(review): presumably the hrefs are relative ("./..."), and
        # dropping "/." turns ".../xmslqk/./x.html" into ".../xmslqk/x.html"
        # - confirm against the live listing.
        getinfos((LISTING_URL + link).replace('/.', ''))


def getinfos(url):
    """Fetch the detail page *url* and append its record(s) to ``end_list``.

    For every ``<div class="xl_nr_16">`` on the page the known layouts are
    tried in order.  The first layout that yields all five fields produces
    the record; if none matches, the record is
    ``[url, 'null', 'null', 'null', 'null']`` so the page can be inspected
    by hand.  Each record is printed as it is collected.
    """
    wb_data = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    wb_data.encoding = wb_data.apparent_encoding
    soup = etree.HTML(wb_data.text)

    for container in soup.xpath('//div[@class="xl_nr_16"]'):
        record = None
        for layout in _LAYOUTS:
            record = _extract_with_layout(container, layout)
            if record is not None:
                break
        if record is None:
            record = [url, 'null', 'null', 'null', 'null']
        print(*record)
        end_list.append(record)


def main():
    """Crawl the front page and pages 1-5, then write the spreadsheet.

    On success the rows go to ``endresult.xls``.  On any failure the
    traceback is printed and whatever has been written to the workbook so
    far is saved as ``error.xls``.
    """
    # Created before the try block so the error handler can always save it.
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('getmessage')
    try:
        header = ['项目名称', '建设地点', '建设单位', '环境影响机构', '受理日期']
        for column, title in enumerate(header):
            sheet.write(0, column, title)

        # The front page has no numeric suffix.
        getlinks(LISTING_URL + 'index.html')
        for page in range(1, 6):
            getlinks(LISTING_URL + 'index_{}.html'.format(page))

        # The last three records are deliberately dropped.
        # NOTE(review): presumably they are not real project entries -
        # confirm against the site.
        for row, record in enumerate(end_list[:-3], start=1):
            for column, message in enumerate(record):
                sheet.write(row, column, message)
        print('写入完毕,最后三项已剔除')
        book.save('endresult.xls')
    except Exception:
        print('wrong')
        traceback.print_exc()
        book.save('error.xls')


if __name__ == '__main__':
    main()
标签:web 环境保护 页面 sheet lxml main etl sse 环境
原文地址:https://www.cnblogs.com/mayunji/p/8874283.html