标签:对象 自动 imp 数组 mss port end url log
Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库。bs4 模块的 BeautifulSoup 配合requests库可以写简单的爬虫。
安装
解析器
安装命令:
requests
pandas
数据结构:
使用
Beautiful Soup将复杂HTML文档转换成一个复杂的树形结构,每个节点都是Python对象,所有对象可以归纳为4种:
Tag:Tag 对象与XML或HTML原生文档中的tag相同,tag中最重要的属性: name和attributes
从网页中获取指定标签、属性值,取值方式:
功能标签
查找元素:
demo
import sys
import io
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from py_teldcore import sqlserver_db as db
# Re-wrap stdout so Chinese characters print correctly on GBK-family consoles.
# (The original used typographic quotes ‘gb18030‘, which is a SyntaxError.)
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="gb18030")

# Historical weather page for Hefei, December 2018 (tianqihoubao.com).
url = "http://www.tianqihoubao.com/lishi/hefei/month/201812.html"
def get_soap():
    """Fetch the weather page and parse it into a BeautifulSoup tree.

    Returns:
        BeautifulSoup: the parsed document on success, or None if the
        request failed. (The original returned the string "Request Error",
        which later crashed downstream ``.find()`` calls with a confusing
        TypeError; None makes the failure explicit.)
    """
    try:
        # Time-bound the request so a stalled server cannot hang the script.
        r = requests.get(url, timeout=10)
        # Treat HTTP 4xx/5xx responses as failures too, not just
        # connection-level errors.
        r.raise_for_status()
        return bs(r.text, "lxml")
    except requests.RequestException as e:
        # Narrowed from bare Exception: only network/HTTP errors are expected.
        print(e)
        return None
def save2cvs(data, path):
    """Write the scraped weather rows to a CSV file.

    Args:
        data: rows of [date, tq, temp, wind] values (array or nested list).
        path: destination CSV file path.

    The original used typographic quotes (‘date‘, ...) which are a
    SyntaxError in Python; plain ASCII quotes restore valid syntax.
    """
    result_weather = pd.DataFrame(data, columns=["date", "tq", "temp", "wind"])
    # gbk encoding keeps Chinese weather descriptions readable in Excel (CN).
    result_weather.to_csv(path, encoding="gbk")
    print("save weather success")  # fixed "sucess" typo in the status message
def save2mssql(data):
    """Bulk-insert weather rows into the SQL Server ``Weather`` table.

    Args:
        data: numpy array (or any nested sequence) of rows
              [date, tq, temp, wind].

    Errors from the database layer are printed rather than raised, so a
    failed save does not abort the whole run (preserves original behavior).
    """
    sql = "Insert into Weather(date, tq, temp, wind) values(%s, %s, %s, %s)"
    # executemany-style APIs expect a sequence of tuples; np.asarray also
    # accepts plain nested lists, unlike the original np.ndarray.tolist(data).
    sqlvalues = [tuple(row) for row in np.asarray(data).tolist()]
    try:
        db.exec_sqlmany(sql, sqlvalues)
    except Exception as e:
        # NOTE(review): assumes db.exec_sqlmany raises on failure — confirm
        # against py_teldcore.sqlserver_db; best-effort save, report only.
        print(e)
def get_data():
    """Scrape the daily weather table and return it as an (n, 4) array.

    Returns:
        numpy.ndarray of shape (n, 4): one row per day with
        [date, weather, temperature, wind], whitespace stripped
        from every cell.

    Raises:
        RuntimeError: if the page could not be fetched or parsed.
    """
    soap = get_soap()
    # get_soap returns a non-soup value on failure; fail fast with a clear
    # message instead of crashing deep inside .find() (also drops the
    # leftover debug print of the whole page).
    if not isinstance(soap, bs):
        raise RuntimeError("failed to fetch weather page: %r" % (soap,))
    rows = soap.find("div", class_="wdetail").find("table").find_all("tr")
    data = []
    # rows[0] is the table header; collect every <td> cell below it,
    # collapsing internal whitespace ("".join(s.split())).
    for tr in rows[1:]:
        for td in tr.find_all("td"):
            data.append("".join(td.get_text().split()))
    # 4 cells per day -> reshape into one row per day.
    return np.array(data).reshape(-1, 4)
# Script entry point: scrape the month's weather, then persist it.
if __name__ == "__main__":
    weather_rows = get_data()
    save2mssql(weather_rows)
    print("save2 Sqlserver ok!")
参考资料
标签:对象 自动 imp 数组 mss port end url log
原文地址:https://www.cnblogs.com/tgzhu/p/11385068.html