
Study Log for February 16



1. Vocabulary memorized: vice (evil; bad habit), drop (to drip; to fall; a tiny amount), otherwise (in another way; by other means), bind (to tie; to bind; to restrain), eligible (qualified; eligible), narrative (narrative; an account), tile (roof tile; ceramic tile), bundle (bundle; package; sheaf), mill (grinder; mill), heave (to lift or haul with effort), gay (merry; cheerful), statistical (of statistics), fence (fence; enclosure; fencing), magnify (to enlarge; to amplify), graceful (graceful; elegant; generous), analyse (to analyze; to break down), artificial (man-made; artificial), privacy (solitude; freedom from intrusion; private life), tub (wooden tub; bathtub), feedback (feedback; response), property (property; assets), upper (upper; higher)

2. Improved some features of the Beijing letter statistics system; crawled Baidu Baike's hot terms in the information field, stored them in MySQL, and implemented a simple word cloud:

  1. Crawl the URL of each Baidu hot-term lemma and save it to citiao_list.txt

import requests
import json
import re
import os

"""
Fetch the URL of each lemma and append it to citiao_list.txt
"""
def getUrlText(url, page):
    try:
        # POST parameters observed in the Ajax request made by the Baidu Baike tag page
        access = {"limit": "24",
                  "timeout": "3000",
                  "filterTags": "%5B%5D",
                  "tagId": "76607",
                  "fromLemma": "false",
                  "contentLength": "40",
                  "page": page}
        header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}
        r = requests.post(url, data=access, headers=header)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        print("Connected successfully")
        return r.text
    except requests.RequestException:
        print("Connection failed")
        return ""

def getJson(html, fpath):
    data = json.loads(html)
    if data and "lemmaList" in data:
        for item in data.get("lemmaList"):
            url = item.get("lemmaUrl")     # lemma URL
            url = re.sub(r"\s+", "", url)
            citiao = url + ",\n"
            print(citiao)
            save_file(fpath, citiao)

# Append msg to the file at file_path
def save_file(file_path, msg):
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(msg)

def main():
    url = "https://baike.baidu.com/wikitag/api/getlemmas"
    fpath = "citiao_list.txt"
    # Remove any previous output, then crawl all pages
    if os.path.exists(fpath):
        os.remove(fpath)
    for page in range(0, 501):
        html = getUrlText(url, page)
        if html:
            getJson(html, fpath)
        print("Page %s" % (page + 1))

main()

 

  2. Use the URLs in citiao_list.txt to crawl each lemma's name and summary and save them to citiao.txt

import requests
import re
import os
from lxml import etree

# Fetch the HTML of a page
def getHTMLText(url):
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0"}
    try:
        r = requests.get(url, headers=header)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""

def get_citiao_massage(url, fpath):
    print(url)
    citiao_html = getHTMLText(url)
    if not citiao_html:
        return ""
    soup = etree.HTML(citiao_html)
    # Lemma title: <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
    name1 = "".join(soup.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()"))
    name2 = "".join(soup.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()"))
    name = name1 + name2
    # Lemma summary: <div class="lemma-summary"><div>...</div></div>
    desc = "".join(soup.xpath("//div[@class='lemma-summary']/div/text()"))
    desc = re.sub(r"\s+", "", desc)
    url = re.sub(r"\s+", "", url)
    citiao = name + "&&" + desc + "&&" + url + "\n"
    print(citiao)
    save_file(fpath, citiao)
    return citiao

# Append msg to the file at file_path
def save_file(file_path, msg):
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(msg)

def Run(out_put_file, fpath):
    lst = []
    cond = 0
    with open(out_put_file, "r") as f:
        urls = f.read()
    # citiao_list.txt stores one URL per line, each followed by a comma
    lsts = [u.strip() for u in urls.split(",") if u.strip()]
    for i in lsts:
        citiao = get_citiao_massage(i, fpath)
        if citiao == "":
            continue
        lst.append(citiao)
        cond += 1
        print("\nProgress: {:.2f}%".format(cond * 100 / len(lsts)), end="")
    return lst

# Main function
def main():
    fpath = "citiao.txt"
    out_put_file = "citiao_list.txt"
    # Remove any previous output, then crawl every URL in the list
    if os.path.exists(fpath):
        os.remove(fpath)
    Run(out_put_file, fpath)

# Program entry point
main()
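The post says the crawled hot terms were stored in MySQL, but that code is not shown. Below is a minimal sketch using pymysql; the database name reci, the table hotword, and its columns are assumptions, not the project's real schema:

import pymysql

# Hypothetical connection settings and schema; adjust to the real database
conn = pymysql.connect(host="localhost", user="root", password="root",
                       database="reci", charset="utf8mb4")
try:
    with conn.cursor() as cursor:
        # Assumed table: hotword(name VARCHAR, summary TEXT, url VARCHAR)
        sql = "INSERT INTO hotword (name, summary, url) VALUES (%s, %s, %s)"
        with open("citiao.txt", "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("&&")
                if len(parts) == 3:
                    cursor.execute(sql, tuple(parts))
    conn.commit()
finally:
    conn.close()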

  3. Segment the contents of citiao.txt and display a word cloud based on each word's frequency; a sketch of one possible implementation is shown below

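A minimal sketch of how this step could look, assuming jieba for segmentation and the wordcloud package for rendering; the font file simhei.ttf and the output name ciyun.png are assumptions:

import jieba
from collections import Counter
from wordcloud import WordCloud

# Each line of citiao.txt is "name&&summary&&url"; keep only name and summary
text = ""
with open("citiao.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("&&")
        if len(parts) == 3:
            text += parts[0] + parts[1]

# Segment with jieba and count frequencies, skipping single-character tokens
words = [w for w in jieba.cut(text) if len(w) > 1]
freq = Counter(words)

# Render the cloud; font_path must point to a font with Chinese glyphs
wc = WordCloud(font_path="simhei.ttf", width=800, height=600,
               background_color="white")
wc.generate_from_frequencies(freq)
wc.to_file("ciyun.png")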

 

 

[Word cloud image]

 3. Problems encountered:

  1. While crawling the lemma URLs, I found that Baidu Baike delivers the lemma list as JSON over Ajax and renders it on the front end, and that the request uses POST. After some searching online, I used the requests library's post method with the right parameters to fetch the JSON, then parsed it with json.loads into a dict, whose lemmaList field is a list containing each lemma's URL.

  2. I still don't fully understand the code that generates the word cloud.

  3. For classifying the lemmas, my initial idea is to fuzzy-match each lemma's summary against keywords for a set of predefined categories; a rough sketch follows this list.

  4. I don't yet know how to build a relationship graph between lemmas; the second sketch below outlines one possible approach.
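A minimal sketch of the fuzzy-matching idea from problem 3; the category names and keyword lists are made-up examples, not from the actual system:

# Hypothetical keyword table: category -> keywords to look for in a summary
CATEGORIES = {
    "人工智能": ["人工智能", "机器学习", "神经网络", "深度学习"],
    "大数据": ["大数据", "数据挖掘", "数据分析"],
    "网络安全": ["安全", "加密", "漏洞"],
}

def classify(summary):
    # Return the first category whose keywords appear in the summary
    for category, keywords in CATEGORIES.items():
        if any(kw in summary for kw in keywords):
            return category
    return "其他"  # fallback when nothing matches

with open("citiao.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("&&")
        if len(parts) == 3:
            print(classify(parts[1]), parts[0])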
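For problem 4, one possible approach (not from the original post): link two lemmas whenever one lemma's name appears in the other's summary, then draw the result with networkx. The edge rule, node sizes, and the output name guanxi.png are all assumptions:

import networkx as nx
import matplotlib.pyplot as plt

# Load (name, summary) pairs from the crawled file
entries = []
with open("citiao.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("&&")
        if len(parts) == 3:
            entries.append((parts[0], parts[1]))

# Connect lemma A to lemma B when A's name appears in B's summary
G = nx.Graph()
for name, _ in entries:
    G.add_node(name)
for name_a, _ in entries:
    for name_b, summary_b in entries:
        if name_a != name_b and name_a in summary_b:
            G.add_edge(name_a, name_b)

# Chinese node labels need a font that supports them (matplotlib font settings)
nx.draw(G, with_labels=True, node_size=100, font_size=8)
plt.savefig("guanxi.png")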

4. Classes start tomorrow; I hope to learn a lot in the new semester.

 


Original post: https://www.cnblogs.com/lq13035130506/p/12319392.html
