标签:
"""Download detail records listed on shixin.court.gov.cn into a text file.

The script POSTs a page number to the listing endpoint, extracts the numeric
ids found in the returned HTML, fetches the detail document for every id and
appends one ``;``-separated line per record to ``fname``.

No source-encoding cookie is declared: Python 3 reads source files as UTF-8
by default, which is what the non-ASCII regex literal below requires.
"""
import ast
import json
import re
import sys
import urllib.parse
import urllib.request

# Endpoint that returns one page of the listing (queried with a POST body).
LIST_URL = "http://shixin.court.gov.cn/unitMore.do"
# Prefix of the detail endpoint; the record id is appended to it.
DETAIL_URL = "http://shixin.court.gov.cn/detail?id="
# Destination of the exported records.
fname = "C:/Users/Songxiaodi/Desktop/result.txt"
# Listing pages to fetch. Adjust as needed; the page count could also be
# parsed out of the listing page itself.
PAGES = range(1, 3)
# Keys written for every record, in output-column order.
FIELDS = (
    "id", "iname", "caseCode", "cardNum", "businessEntity", "courtName",
    "areaName", "partyTypeName", "gistId", "regDate", "gistUnit", "duty",
    "performance", "disruptTypeName", "publishDate",
)

# Compiled once instead of on every getId() call. The pattern itself is the
# original one: it captures the digits of an ``id="..."`` attribute inside a
# bracketed "[<a ...>查看...]" link. ``.`` does not cross newlines and ``.*``
# is greedy, so at most one id is captured per line of HTML.
_ID_PATTERN = re.compile(r"\[<a .*id=\"(\d+)\">查看.*\]")


def getId(url, i):
    """Return the detail ids found on listing page ``i``.

    Args:
        url: Address of the listing endpoint.
        i: Page number, sent as the ``currentPage`` form field.

    Returns:
        A list of id strings (digits only), in the order they appear in
        the returned HTML.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    postdata = urllib.parse.urlencode({'currentPage': i}).encode('utf-8')
    # Passing a body makes urlopen issue a POST; the context manager closes
    # the response, which the original left open.
    with urllib.request.urlopen(url, postdata) as page:
        html = str(page.read(), "utf-8")
    return _ID_PATTERN.findall(html)


def _fetch_record(detail_id):
    """Download the detail document for ``detail_id`` and return it as a dict."""
    with urllib.request.urlopen(DETAIL_URL + detail_id) as page:
        text = str(page.read(), "utf-8")
    # Drop escaped newlines (the two characters backslash + n) so that each
    # record stays on a single output line after decoding.
    text = text.replace("\\n", "")
    print(text)
    # SECURITY: the original called eval() on this remote text, which lets
    # the server execute arbitrary code and also breaks on JSON null / true /
    # false. Parse it as data instead.
    try:
        return json.loads(text)
    except ValueError:
        # Fallback for responses that are Python-literal style rather than
        # strict JSON; literal_eval only accepts literals, never code.
        return ast.literal_eval(text)


def _format_record(record):
    """Return the ``;``-joined output line for one record (without newline).

    Every field is converted with ``str`` so numeric values no longer raise
    ``TypeError``; ``None`` (JSON null) is written as an empty column.
    A key missing from ``record`` raises ``KeyError``.
    """
    return ";".join(
        "" if record[key] is None else str(record[key]) for key in FIELDS
    )


def main():
    """Fetch every configured listing page and export its records to ``fname``."""
    # The file is opened with the platform's default text encoding, as before;
    # ``with`` guarantees it is closed even if an unexpected error escapes.
    with open(fname, 'w') as out:
        for i in PAGES:
            try:
                tt = getId(LIST_URL, i)
            except Exception as err:
                # Best effort: report the failed page and move on.
                print(err)
                continue
            print(tt)
            for k in tt:  # k is the id of one detail page
                try:
                    line = _format_record(_fetch_record(k))
                except Exception as err:
                    # A single bad record must not discard the rest of the page.
                    print(err)
                    continue
                out.write(line + "\n")


if __name__ == "__main__":
    main()
标签:
原文地址:http://www.cnblogs.com/maomaoxiyu/p/4348778.html