标签:
# Demo script: query a MongoDB collection with pymongo.
import pymongo

client = pymongo.MongoClient('localhost', 27017)  # MongoDB client for the server on localhost:27017
walden = client['walden']  # handle to the database named 'walden'
sheet_tab = walden['sheet_tab']  # handle to the collection (table) named 'sheet_tab'

# Demo 1 (disabled): store each line of a text file as one document holding
# the line index, the line text and its whitespace-separated word count.
# path = '/Users/qiongyanzhu/Documents/Plan-for-combating-master/week2/2_1/2_1code_of_video/walden.txt'
# with open(path, 'r') as f:
#     lines = f.readlines()
#     for index, line in enumerate(lines):
#         data = {
#             'index': index,
#             'line': line,
#             'words': len(line.split())
#         }
#         print(data)
#         sheet_tab.insert_one(data)

# Demo 2 (disabled): print every document whose 'words' field equals 0.
# for item in sheet_tab.find({'words': 0}):
#     print(item)

# Demo 3: filter with a comparison operator.
# Available comparison operators: $lt / $lte / $gt / $gte / $ne
for item in sheet_tab.find({'words': {'$lt': 5}}):
    print(item)

# Demo 4: print the 'line' field of every document in the collection.
for item in sheet_tab.find():
    print(item['line'])
# Demo script: scrape short-term rental listings and store them in MongoDB.
from bs4 import BeautifulSoup
import requests
import pymongo

client = pymongo.MongoClient('localhost', 27017)
xiaozhu = client['xiaozhu']
sheet_tab = xiaozhu['sheet_tab']

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Search-result pages 1 through 3.
url_as = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number) for number in range(1, 4)]


def insert_house_info(url_s):
    """Scrape each listing page in *url_s* and store its listings in MongoDB.

    Every page is downloaded and parsed with the lxml parser, and one
    document with the keys ``price`` (int), ``title`` (str) and ``url`` is
    inserted into ``sheet_tab`` for each listing found.

    Args:
        url_s: Iterable of search-result page URLs to scrape.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the request exceeds ``REQUEST_TIMEOUT``.
        ValueError: If a price text, with its first character and last two
            characters removed, is not a valid integer.
    """
    for url_a in url_s:
        # Fetch the page.
        wb_data = requests.get(url_a, timeout=REQUEST_TIMEOUT)
        # Parse the HTML with the lxml parser.
        soup = BeautifulSoup(wb_data.text, 'lxml')
        prices = soup.select('span.result_price')
        titles = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname > div > a > span')
        urls = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname')
        # zip() stops at the shortest list, so a listing missing any of the
        # three elements shortens (and may misalign) the result.
        for price, title, url in zip(prices, titles, urls):
            price_text = price.get_text()
            info = {
                # Drop the first character and the last two characters.
                # NOTE(review): presumably a currency sign and a two-character
                # "per night" suffix -- confirm against the live page markup.
                'price': int(price_text[1:-2]),
                'title': title.get_text(),
                'url': url.get('detailurl'),
            }
            # print(info)
            sheet_tab.insert_one(info)


def find_house():
    """Print every document in ``sheet_tab`` whose ``price`` is greater than 500."""
    for info in sheet_tab.find({'price': {'$gt': 500}}):
        print(info)


insert_house_info(url_as)
find_house()
标签:
原文地址:http://www.cnblogs.com/mspeer/p/5634232.html