标签:als 关注 pandas [] onclick except += strip unit
1.数据库:mysql
2.库:requests、BeautifulSoup、pandas、smtplib、mysql.connector
db.py:
# Connection settings; the main program unpacks this dict into
# mysql.connector.connect(**config). The '****' values look like masked
# placeholders for the real credentials -- TODO confirm before use.
config = {
    'host': '127.0.0.1',
    'user': '****',
    'passwd': '****',
    'database': 'houseinfo',
}
url.json
{ "url": "https://bj.*****.com/ershoufang", "urls": [ "/bp0ep100000ba0ea20000l3c1111027378232/?sug=%E5%88%A9%E6%B3%BD%E8%A5%BF%E5%9B%AD&noStatic=1", "/bp0ep100000ba0ea20000l3c1111027378505/?sug=%E5%8D%97%E6%B9%96%E4%B8%9C%E5%9B%AD%E4%B8%80%E5%8C%BA" ] }
主程序:
#!/usr/bin/python
# coding=utf-8

import json
import time
import requests
from bs4 import BeautifulSoup
import mysql.connector
import smtplib
from email.mime.text import MIMEText
from email.header import Header
import mysql
import pandas as pd
import pachong.db
import unittest

# Browser-like User-Agent sent with every listing request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36'
                  ' (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}

# Seconds to wait for the listing site before giving up, so that a stalled
# connection cannot hang the whole run.
_REQUEST_TIMEOUT = 30

_INSERT_SQL = (
    "INSERT INTO houseInfo (title, partName, ttype, area, orientations, "
    "decorate, floor, buildYear,address, care, subway, book, total, price) "
    "VALUES (%s, %s, %s, %s,"
    " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
)

# gethouseinfo() selects columns in table order but returns them in the order
# convert_to_html() labels them; each entry is the index of a selected column:
# id, partName, title, ttype, area, total, book, price, orientations,
# decorate, floor, buildYear, subway, address, care.
_RESULT_COLUMN_ORDER = (0, 1, 2, 3, 4, 5, 14, 6, 7, 8, 9, 10, 13, 11, 12)


def _fetch_listing_nodes(url):
    """Download one listing page and return its ``info clear`` elements.

    Before parsing, ``;`` separators are injected around ``<span>``/``</a>``
    boundaries so that the text of neighbouring fields can later be split
    apart with ``split(';')``.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status (via ``raise_for_status``).
    """
    response = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page as "no listings".
    response.raise_for_status()
    source = (
        response.text
        .replace('<span class="divide">/</span>', ';')
        .replace('</a>', ';</a>')
        .replace('<span>', '<span>;')
        .replace('</span>', ';</span>')
    )
    soup = BeautifulSoup(source, 'html.parser')
    return soup.find_all(attrs={'class': 'info clear'})


def _parse_listing(node):
    """Extract the 14 values of one listing, in ``_INSERT_SQL`` column order.

    Returns:
        tuple: (title, partName, ttype, area, orientations, decorate, floor,
        buildYear, address, care, subway, book, total, price).
    """
    title = node.find(class_='title').a.get_text().strip(';')

    # houseInfo text is ';'-separated; index 1 is skipped, as in the
    # original field mapping.
    house_info = node.find(class_='houseInfo').get_text().split(';')
    part_name = house_info[0]
    ttype = house_info[2]
    area = house_info[3]
    orientations = house_info[4]
    decorate = house_info[5]
    elevator = house_info[6]

    position = node.find(class_='positionInfo')
    position_info = position.get_text().split(';')
    floor = elevator + position_info[0]
    build_year = position_info[1]
    address = position.a.get_text().strip(';')

    care = node.find(class_='followInfo').get_text('/').split('/')[0]

    # The subway tag is treated as optional, like the ownership tags below,
    # so a listing without it does not abort the whole run.
    subway_node = node.find(class_='subway')
    subway = '' if subway_node is None else subway_node.get_text().strip(';')

    # Ownership tag: prefer 'five', fall back to 'taxfree', else empty.
    book_node = node.find(class_='five')
    if book_node is None:
        book_node = node.find(class_='taxfree')
    book = '' if book_node is None else book_node.get_text().strip(';')

    total = node.find(class_='totalPrice').span.get_text().strip(';') + '万'
    price = node.find(class_='unitPrice').span.get_text().strip(';')

    return (title, part_name, ttype, area, orientations, decorate, floor,
            build_year, address, care, subway, book, total, price)


# Fetch listings from the web and store them in the database.
def savehouseinfo():
    """Refresh the ``houseInfo`` table from the pages listed in ``url.json``.

    ``url.json`` must contain a base ``url`` and a list of path suffixes
    ``urls``; each ``url + suffix`` page is downloaded and every listing on
    it is inserted after the table has been emptied.

    The delete and all inserts are committed once at the end, so a failure
    part-way through does not commit a half-refreshed table (on a
    transactional storage engine). The cursor and connection are always
    closed.

    Raises:
        requests.RequestException: if a page cannot be downloaded.
        mysql.connector.Error: on database failure.
    """
    with open('url.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    base_url = data['url']
    paths = data['urls']

    mydb = mysql.connector.connect(**pachong.db.config)
    try:
        mycursor = mydb.cursor()
        try:
            # Empty the table before loading the fresh snapshot.
            mycursor.execute("delete from houseInfo;")
            for path in paths:
                url = base_url + path
                print(url)
                for node in _fetch_listing_nodes(url):
                    mycursor.execute(_INSERT_SQL, _parse_listing(node))
            # Changes are only persisted once commit() is called.
            mydb.commit()
        finally:
            mycursor.close()
    finally:
        mydb.close()


# Read the stored listings back from the database.
def gethouseinfo():
    """Return every ``houseInfo`` row as 15 parallel column lists.

    Returns:
        tuple | None: ``(ids, partnames, names, ttypes, areas, totals, books,
        prices, orientationses, decorates, floors, buildYears, subways,
        addresses, cares)``, each a list with one entry per row (all empty
        when the table is empty). ``None`` if the query fails; the failure
        is reported on stdout.
    """
    sql = '''select id,partName,title,ttype, area, total, price, orientations, decorate, floor, buildYear,
            address, care, subway, book from houseInfo;'''
    mydb = mysql.connector.connect(**pachong.db.config)
    try:
        mycursor = mydb.cursor()
        try:
            mycursor.execute(sql)
            table = mycursor.fetchall()
        finally:
            mycursor.close()
    except mysql.connector.Error:
        print("Error:unable to facth data")
        return None
    finally:
        # Runs on success and failure, so the connection is never leaked.
        mydb.close()

    return tuple([row[i] for row in table] for i in _RESULT_COLUMN_ORDER)


def highlight(data=None):
    """Return red-background CSS declarations.

    Args:
        data: optional sized object (e.g. the Series handed over by
            ``Styler.apply``). When given, one declaration per element is
            returned; when omitted, a single-element list is returned.
    """
    style = 'background-color: #FF0000'
    if data is None:
        return [style]
    return [style] * len(data)


# Turn the data read from the database into HTML.
def convert_to_html():
    """Render the stored listings as an HTML table.

    Returns:
        str: ``DataFrame.to_html()`` output with the Chinese column titles.
        If the database read failed, the table has headers but no rows.
    """
    titles = ['编号', '小区名称', '出售房名称', '房型', '面积', '总价', '房本',
              '单价', '朝向', '装修', '楼层', '建造年份', '地铁', '地址', '关注度']
    result = gethouseinfo()
    if result is None:
        # gethouseinfo() already reported the failure; emit an empty table
        # rather than failing with TypeError while iterating None.
        result = [[] for _ in titles]
    df = pd.DataFrame(dict(zip(titles, result)), columns=titles)
    # Raise the cell width limit so long values are not truncated.
    pd.set_option('display.max_colwidth', 500)
    # NOTE: the table is rendered without Styler, so highlight() has no
    # effect on this HTML.
    return df.to_html()


# Send the report by e-mail.
def sendmail():
    """E-mail the HTML listing table through the 163.com SMTP server.

    SMTP errors are printed, not raised. The SMTP session is closed when
    the ``with`` block exits, whether or not sending succeeded.
    """
    # Third-party SMTP service
    mail_host = "smtp.163.com"  # SMTP server
    mail_user = "******"  # user name
    mail_pass = "*****"  # password

    sender = '****@163.com'
    receivers = ['****@163.com']  # recipients; any mailbox can be used

    h = convert_to_html()
    message = MIMEText(h, _subtype='html', _charset='utf-8')
    message['From'] = Header("***", 'utf-8')
    message['To'] = Header('***', 'utf-8')

    t = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    subject = '二手房_' + t
    message['Subject'] = Header(subject, 'utf-8')

    try:
        with smtplib.SMTP() as smtpObj:
            smtpObj.connect(mail_host, 25)  # 25 is the SMTP port
            smtpObj.login(mail_user, mail_pass)
            smtpObj.sendmail(sender, receivers, message.as_string())
            print("邮件发送成功")
    except smtplib.SMTPException as e:
        print(e)
标签:als 关注 pandas [] onclick except += strip unit
原文地址:https://www.cnblogs.com/qixiafeng/p/10758582.html