标签:blog pes pip sea 提取 结构化 组织 pat index
以下内容为原创,转载请注明出处。
"""Download AMap (Gaode) POIs for one category and district into an .xls file.

The script pages through the AMap "place/text" web service (XML output),
extracts the basic POI fields from every page and writes one spreadsheet row
per POI below a header row.
"""
import re  # regular expressions, used to pull the text out of each XML element
import urllib.request  # HTTP client from the Python 3 standard library

import xlwt  # writes .xls workbooks; install with: pip install xlwt
from bs4 import BeautifulSoup  # markup parser; install with: pip install beautifulsoup4

# POI fields returned when extensions=base; also used as the header row.
poiTag = ["id", "name", "type", "typecode", "biz_type", "address",
          "location", "tel", "pname", "cityname", "adname"]

# Captures the text between the '>' closing an opening tag and the next '<'.
pattern = re.compile("(?:>)(.*?)(?=<)", re.S)

poiExcel = xlwt.Workbook()  # new workbook
sheet = poiExcel.add_sheet("poiResult")  # new worksheet named "poiResult"
for colIndex, tagName in enumerate(poiTag):
    sheet.write(0, colIndex, tagName)  # header row

offset = 10  # POIs per page in this example (the API allows at most 25)
maxPage = 10  # maximum number of pages to request (the API allows at most 100)
# Example category: health-care services. Code table:
# http://a.amap.com/lbs/static/zip/AMap_poicode.zip
types = "090000"
# Example district: Nanshan, Shenzhen. Code table:
# http://a.amap.com/lbs/static/zip/AMap_adcode_citycode.zip
city = "440305"

# Next free worksheet row (row 0 is the header). A running counter is used
# instead of deriving the row from the current page's result count: a page
# shorter than the previous ones would otherwise map onto rows that are
# already filled, which makes xlwt raise an overwrite error and lose POIs.
nextRow = 1

for pageIndex in range(1, maxPage + 1):
    try:
        # Structured request URL; replace the key placeholder with your own
        # key, see http://lbs.amap.com/api/webservice/guide/api/search/
        url = ("http://restapi.amap.com/v3/place/text?&keywords=&types=" + types
               + "&city=" + city
               + "&citylimit=true&output=xml&offset=" + str(offset)
               + "&page=" + str(pageIndex)
               + "&key=你的key&extensions=base")

        # Close the HTTP response as soon as the page has been read.
        with urllib.request.urlopen(url) as response:
            poiSoup = BeautifulSoup(response.read(), "xml")

        # One list of matching elements per field, in poiTag order.
        poiSoupTag = [poiSoup.find_all(tagName) for tagName in poiTag]

        if not poiSoupTag[0]:
            # No POI on this page: later pages cannot hold any either,
            # so stop instead of spending more API requests.
            break

        for rowIndex in range(len(poiSoupTag[0])):
            # Reserve the row before writing so that a failure in the middle
            # of a row never causes the next POI to reuse the same row.
            rowNumber = nextRow
            nextRow += 1
            for colIndex, elements in enumerate(poiSoupTag):
                # Join the regex matches into a plain string so the cell is
                # stored as ordinary text (empty elements give an empty cell).
                cellText = "".join(pattern.findall(str(elements[rowIndex])))
                sheet.write(rowNumber, colIndex, cellText)
    except Exception as e:
        # Best effort: report the problem and carry on with the next page.
        print(e)

poiExcel.save("E:/POI&" + types + "&" + city + ".xls")  # save the workbook
print("Done!")  # finished
注:页面过大时,部分单元格有概率出现“重复写入同一单元格”的错误(猜测与某些页返回的数据不足一整页有关)。由于代码中捕获并打印了异常,程序不会因此中断运行,但会导致极小部分POI丢失。
Python——使用高德API获取POI(以深圳南山医疗保健服务POI为例)
标签:blog pes pip sea 提取 结构化 组织 pat index
原文地址:http://www.cnblogs.com/shadrach/p/7615815.html