Python高级应用程序设计任务

时间：2019-12-14 21:27:00 阅读：104 评论：0 收藏：0 [点我收藏+]

标签：str 网页标题 bar requests int 出图 mil 列表

Python高级应用程序设计任务要求

用Python实现一个面向主题的网络爬虫程序，并完成以下内容：
（注：每人一题，主题内容自选，所有设计内容与源代码需提交到博客园平台）

一、主题式网络爬虫设计方案（15分）

1.主题式网络爬虫名称

　　爬取各地的天气信息
2.主题式网络爬虫爬取的内容与数据特征分析

　　爬取全国各地的天气信息并保存。

　　计算全国各省的pm2.5平均值，并绘成柱状图
3.主题式网络爬虫设计方案概述（包括实现思路与技术难点）

　　思路：本次设计方案主要依靠request库对目标页面进行信息的爬取采集，再用BeautifulSoup对数据进行清洗，最后将结果打印出来。

　　技术难点：对于数据的清洗还有各个省份之间的对应关系。

二、主题页面的结构特征分析（15分）
1.主题页面的结构特征

2.Htmls页面解析

这我们可以知道这个天气网查询的api是这个地址https://www.tianqi.com/+城市的链接。所以我们需要获取到所有城市的链接信息

3.节点（标签）查找方法与遍历方法
（必要时画出节点树结构）

从这个链接中我们可以得到所有城市的信息http://www.tianqi.com/chinacity.html

技术图片

从这我们可以看出所有的省份都是存在这里的我们只需要定位到<div class="citybox">标签然后拿到所有的<h2>标签即可

技术图片

我们可以看出所有的城市都是在<div class="citybox"> --> <span> --><a>；按照这个标签顺序我们就可以获取到所有的城市，接下来我们只要把各个省市绑定在字典上即可

技术图片

主页面的解析

从这里我们可以明确的看到各个天气信息所属的标签，很容易就可以遍历得出

技术图片

三、网络爬虫程序设计（60分）
爬虫程序主体要包括以下各部分，要附源代码及较详细注释，并在每部分程序后面提供输出结果的截图。

import requests
import time
import os
import re
import pandas as pd
from bs4 import BeautifulSoup as BS
import matplotlib.pyplot as plt
def getHtml(url):
    try:
        kv = {‘user-agent‘: ‘Mozilla/5.0‘}  #伪装下访问的标识不然会被拦截
        r = requests.get(url,headers=kv)
        r.raise_for_status()    #抛出访问失败的异常
        r.encoding = r.apparent_encoding    #设置编码格式防止乱码
        return r.text   #返回爬取到网页的全部代码
    except:
        print("爬取失败")
#获取温度
def getTemperature(html):
    soup = BS(html, "html.parser")
    a = soup.find("p",attrs="now")
    return a.text
#获取天气信息
def getWeather(html):
    soup = BS(html, "html.parser")
    b = soup.find("dd",attrs="kongqi")
    return b.text
#获取pm2.5
def getPm(html):
    soup = BS(html, "html.parser")
    b = soup.find("dd",attrs="kongqi")
    aa = b.h6.text
    pm = re.findall("\d+", aa)
    return int(pm[0])
#数据存储
def dataSave():
     try:
          #创建文件夹
         os.mkdir("C:\天气")
     except:
         #如果文件夹存在则什么也不做
         ""
#获取各个县市信息英文
def getCityHref(html):
    #创建2个空表用来存放城市的信息
    oldList = []
    cityHrefList = []
    #把获取到的内容包装成BS对象
    soup = BS(html, "html.parser")
    #提取包含城市的标签
    tab = soup.find("div",attrs="citybox")
    #获取到所有城市的标签
    a = tab.find_all("span")
    #遍历得到各个省市
    for d in range(len(a)):
        for x in a[d].find_all("a"):
            if x is not None:
                oldList.append(x.get("href"))
        #把得到得省包装在一个列表上，各个省之间分开
        cityHrefList.append(oldList)
        #清除列表
        oldList = []
    #返回一个包含所有省市得列表
    return cityHrefList
#获取各个省份信息中文
def getCityName(html):
    #创建2个空表用来存放省市的信息
    oldList = []
    cityNameList = []
    soup = BS(html, "html.parser")
    #提取包含省市的标签
    b = soup.find("div",attrs="citybox")
    #获取到所有省市的标签
    a = b.find_all("span")
    #遍历得到各个省市
    for d in range(len(a)):
        for x in a[d].find_all("a"):
            if x is not None:
                oldList.append(x.text)
        #print(list)
        #把得到得省包装在一个列表上，各个省之间分开
        cityNameList.append(oldList)
        #清除列表
        oldList = []
    #返回一个包含所有省市得列表
    return cityNameList
#获取各个省份信息
def getProvince(html):
    ProvinceList = []
    soup = BS(html, "html.parser")
    b = soup.find("div", attrs="citybox")
    for x in b.find_all("h2"):
        ProvinceList.append(x.text)
    return ProvinceList
#获取所有地方的天气
def getCityWeather(dic,dic2):
    nowTime = time.strftime("%Y-%m-%d %H:%M", time.localtime())
    for into in cityProvinceList:
        # 数据保存
        dataSave()
        try:
            # 创建文件用于存储爬取到的数据
            with open("C:\\天气\\各地天气信息.txt", "a") as f:
                f.write("当前时间为{}以下为{}省的天气\n".format(nowTime,into))
        except:
            "存储失败"
        print("正在存储{}省的天气".format(into))
        for x in dic.get(into):
            url = "https://www.tianqi.com/" + x
            html = getHtml(url)
            try:
                # 创建文件用于存储爬取到的数据
                with open("C:\\天气\\各地天气信息.txt", "a") as f:
                        f.write("{}:{},{}\n".format(dic2.get(x),getTemperature(html),getWeather(html)))
            except:
                "存储失败"
        print("存储成功")
#获取各省的平均pm2.5
def getCityPM(cityProvinceList,dic):
    pmList = [] #存放各省的平均pm2.5
    pm = 0      #pm2.5累加
    num = 0     #计数
    start_time1 = time.time()
    for into in cityProvinceList[0:6]:
        start_time = time.time()  # 记录当前时间
        print("以下为" + into + "省pm2.5的平均值")
        for x in dic.get(into):
            url = "https://www.tianqi.com/" + x
            html = getHtml(url)
            pm = pm + getPm(html)
            num += 1
            avePm = round(pm/num)
        # pmList.append([into,avePm])
        pmList.append(avePm)
        elapse_time = time.time() - start_time
        print("计算{}的pm2.5平均值成功，共有{}个城市，平均值为{}，耗时{:.2f}s".format(into,num,avePm,elapse_time))
        num = 0 #计数归0一下
    elapse_time = time.time() - start_time1
    print("总耗时{:.2f}分".format(elapse_time/60))
    return pmList
if __name__ == "__main__":
    html = getHtml("http://www.tianqi.com/chinacity.html")
    cityHreflist = getCityHref(html) #获取市链接
    cityProvinceList = getProvince(html) #获取省
    cityName = getCityName(html) #获取市中文
    #把各个省份按照字典得形式绑定起来,省市
    dic = {}
    #市英语转汉字
    dic2 = {}
    for d in range(len(cityHreflist)):
        dic.update({cityProvinceList[d]:cityHreflist[d]})
    for aa in range(len(cityName)):
        for xx in range(len(cityHreflist[aa])):
            dic2.update({cityHreflist[aa][xx]:cityName[aa][xx]})
    #获取各地的信息
    #getCityWeather(dic,dic2)
    #获取各省pm2.5的平均值返回列表
    pmList = getCityPM(cityProvinceList, dic)
    plt.rcParams[‘font.sans-serif‘] = [‘SimHei‘]  # 用来正常显示中文标签
    plt.rcParams[‘axes.unicode_minus‘] = False  # 用来正常显示负号
    s = pd.Series(pmList,cityProvinceList[0:6])
    # 设置图表标题
    s.plot(kind=‘bar‘, title=‘6个省份pm2.5平均值对比‘)
    # 输出图片
    plt.show()
    # into = input("请输入你要查询得省份")
    # for x in dic.get(into):
    #     url = "https://www.tianqi.com/" + x
    #     html = getHtml(url)
    #     print(dic2.get(x) + ":" + getTemperature(html), end="")
    #     print(getWeather(html))
    #     print("--------------")

1.数据爬取与采集

2.对数据进行清洗和处理

def getCityWeather(dic,dic2):
    nowTime = time.strftime("%Y-%m-%d %H:%M", time.localtime())
    for into in cityProvinceList:
        print("当前时间为{}以下为{}省的天气".format(nowTime,into))
        for x in dic.get(into):
            url = "https://www.tianqi.com/" + x
            html = getHtml(url)
            print(dic2.get(x) + ":" + getTemperature(html), end="")
            print(getWeather(html))
        print("--------------")

def getCityPM(cityProvinceList,dic):
    pmList = [] #存放各省的平均pm2.5
    pm = 0      #pm2.5累加
    num = 0     #计数
    for into in cityProvinceList:
        print("以下为" + into + "省pm2.5的平均值")
        for x in dic.get(into):
            start_time = time.time()    #记录当前时间
            url = "https://www.tianqi.com/" + x
            html = getHtml(url)
            pm = pm + getPm(html)
            num += 1
            avePm = round(pm/num)
        pmList.append([into,avePm])
        elapse_time = time.time() - start_time
        print("计算{}的pm2.5平均值成功，共有{}个城市，平均值为{}，耗时{}".format(into,num,avePm,elapse_time))
        num = 0 #计数归0一下
    return pmList

技术图片

3.文本分析（可选）：jieba分词、wordcloud可视化
4.数据分析与可视化
（例如：数据柱形图、直方图、散点图、盒图、分布图、数据回归分析等）

#获取各省的平均pm2.5
def getCityPM(cityProvinceList,dic):
    pmList = [] #存放各省的平均pm2.5
    pm = 0      #pm2.5累加
    num = 0     #计数
    start_time1 = time.time()
    for into in cityProvinceList[0:6]:
        start_time = time.time()  # 记录当前时间
        print("以下为" + into + "省pm2.5的平均值")
        for x in dic.get(into):
            url = "https://www.tianqi.com/" + x
            html = getHtml(url)
            pm = pm + getPm(html)
            num += 1
            avePm = round(pm/num)
        # pmList.append([into,avePm])
        pmList.append(avePm)
        elapse_time = time.time() - start_time
        print("计算{}的pm2.5平均值成功，共有{}个城市，平均值为{}，耗时{:.2f}s".format(into,num,avePm,elapse_time))
        num = 0 #计数归0一下
    elapse_time = time.time() - start_time1
    print("总耗时{:.2f}分".format(elapse_time/60))
    return pmList
if __name__ == "__main__":
    html = getHtml("http://www.tianqi.com/chinacity.html")
    cityHreflist = getCityHref(html) #获取市链接
    cityProvinceList = getProvince(html) #获取省
    cityName = getCityName(html) #获取市中文
    #把各个省份按照字典得形式绑定起来,省市
    dic = {}
    #市英语转汉字
    dic2 = {}
    for d in range(len(cityHreflist)):
        dic.update({cityProvinceList[d]:cityHreflist[d]})
    for aa in range(len(cityName)):
        for xx in range(len(cityHreflist[aa])):
            dic2.update({cityHreflist[aa][xx]:cityName[aa][xx]})
    #获取各地的信息
    #getCityWeather(dic,dic2)
    #获取各省pm2.5的平均值返回列表
    pmList = getCityPM(cityProvinceList, dic)
    plt.rcParams[‘font.sans-serif‘] = [‘SimHei‘]  # 用来正常显示中文标签
    plt.rcParams[‘axes.unicode_minus‘] = False  # 用来正常显示负号
    s = pd.Series(pmList,cityProvinceList[0:6])
    # 设置图表标题
    s.plot(kind=‘bar‘, title=‘6个省份pm2.5平均值对比‘)
    # 输出图片
    plt.show()

5.数据持久化

数据的存储

#获取所有地方的天气
def getCityWeather(dic,dic2):
    nowTime = time.strftime("%Y-%m-%d %H:%M", time.localtime())
    for into in cityProvinceList:
        # 数据保存
        dataSave()
        try:
            # 创建文件用于存储爬取到的数据
            with open("C:\\天气\\各地天气信息.txt", "a") as f:
                f.write("当前时间为{}以下为{}省的天气\n".format(nowTime,into))
        except:
            "存储失败"
        print("正在存储{}省的天气".format(into))
        for x in dic.get(into):
            url = "https://www.tianqi.com/" + x
            html = getHtml(url)
            try:
                # 创建文件用于存储爬取到的数据
                with open("C:\\天气\\各地天气信息.txt", "a") as f:
                        f.write("{}:{},{}\n".format(dic2.get(x),getTemperature(html),getWeather(html)))
            except:
                "存储失败"
        print("存储成功")