标签:写入 tps select 定时 模块 提交 write 很多 col
通过F12,对页面进行检查,查看我们所需要爬取内容的相关代码
3.文本分析(可选):jieba分词、wordcloud可视化
# 导入相关模块 import requests from bs4 import BeautifulSoup import pandas as pd def getHtml(url): # 判断爬取 是否出错 try: # 使用get方式爬取页面,添加头部伪装浏览器 r = requests.get(url, headers={‘user-agent‘: ‘Mozilla/5.0‘}) r.raise_for_status() # 设置编码格式 r.encoding = r.apparent_encoding # 返回源码 return r.text except: return "页面爬取Error" def HotelList(text,hotellist): soup = BeautifulSoup(text, "html.parser") # 爬取酒店列表 hotel_list = soup.select("div#hotel_list>div") # 循环 for hotel in hotel_list: # 将可能出现错误的地方进行跳过 try: # 酒店名称 hotel_name = hotel.select("h2")[0].text # 酒店链接 href = hotel.select("h2>a")[0].attrs["href"] # 服务贫家 recommend = hotel.select("span.recommend")[0].text p = hotel.select("p.hotel_item_htladdress") # 酒店地址 dizhi = p[0].text # 最新预定时间 newtime = hotel.select("p.hotel_item_last_book")[0].text # 评分 hotel_score = hotel.find_all("span", "hotel_value")[0].text # 点评人数 people_number = hotel.select("span.hotel_judgement>span")[0].text # 酒店评分 level = hotel.find_all("span", "hotel_level")[0].text # 质量保证 ico_quality_gold = hotel.select("span.ico_quality_gold")[0].text # 所属地区 diqu = p[0].find_all( "a", attrs={"tracekey": "nhtllistroomclick"})[0].text # price = hotel.find_all("span", "J_price_lowList ")[0].text hotellist.append([hotel_name, href, recommend, dizhi, newtime, hotel_score, people_number, level, ico_quality_gold, diqu]) print([hotel_name, href, recommend, dizhi, newtime, hotel_score, people_number, level, ico_quality_gold, diqu]) except: "一个数据爬取出错" hotellist.append([hotel_name, href, recommend, dizhi, newtime, hotel_score, people_number, level, ico_quality_gold, diqu]) def savedata(hotellist): wri = pd.ExcelWriter("HotelList.xlsx") col = ["hotel_name", "href", "recommend", "dizhi", "newtime", "hotel_score", "people_number", "level", "ico_quality_gold", "diqu"] pf = pd.DataFrame(hotellist,columns=col) # 写入excel pf.to_excel(wri) wri.save() def main(): # 存放数据的数组 hotellist = [] text = getHtml( "https://hotels.ctrip.com/hotel/quanzhou406#ctm_ref=hod_hp_sb_lst") HotelList(text, hotellist) #打印结果信息 print(hotellist) # 数据保存 savedata(hotellist)
标签:写入 tps select 定时 模块 提交 write 很多 col
原文地址:https://www.cnblogs.com/wangjiaping/p/12076949.html