标签:art next ast 转换 details 时间 code ams col
编译在线环境:
1、使用Python爬虫爬取豆瓣网某一部电影的评论信息;
2、从评论信息中统计各级星评的数量占比
"""Scrape Douban short comments for one movie and plot the star ratings.

Steps:
1. Build the paginated comment URLs for the movie.
2. Download each page and extract one record per comment.
3. Collect everything in a DataFrame, save it as CSV and draw a histogram
   of the star ratings.
"""
import re
from urllib import request

from bs4 import BeautifulSoup
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

# Column order of the resulting table. Declared once so that an empty scrape
# still yields a frame with the expected columns.
COLUMNS = ['用户名', '所评星级', '评论', '评论获赞数', '评论时间']


def get_urls(n):
    """Return the URLs of the first ``n`` comment pages.

    Args:
        n: Number of pages to generate.

    Returns:
        A list of ``n`` URLs; page ``i`` starts at comment ``i * 20`` because
        each page is requested with ``limit=20``.
    """
    return [
        'https://movie.douban.com/subject/30166972/comments?start='
        + str(i * 20)
        + '&limit=20&sort=new_score&status=P'
        for i in range(n)
    ]


def get_informations(u):
    """Download one comment page and extract its comments.

    Args:
        u: URL of a comment page, as produced by ``get_urls``.

    Returns:
        A list of dicts, one per ``div.comment-item`` element, keyed by the
        names in ``COLUMNS``. The star rating is a string of digits, or
        ``None`` when no digits could be found.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(u, headers=headers, timeout=10)
    soup = BeautifulSoup(r.text, 'lxml')
    ims = soup.find_all('div', class_='comment-item')

    data = []
    for i in ims:
        info = i.find('span', class_='comment-info')
        dic = {}
        dic['用户名'] = info.find('a', class_="").text
        # The rating element is taken to be two siblings after the first
        # <span> inside comment-info; its digits are read from its markup.
        # NOTE(review): if a comment carries no rating, this sibling may be a
        # different element whose markup also contains digits - confirm
        # against the live page.
        mystar = str(info.find('span').next_sibling.next_sibling)
        t = re.findall(r'\d+', mystar)
        # Guard against markup without any digits instead of raising
        # IndexError; None becomes NaN once the column is cast to float.
        dic['所评星级'] = str(t[0]) if t else None
        dic['评论'] = i.find('span', class_='short').text
        dic['评论获赞数'] = i.find('span', class_='votes').text
        dic['评论时间'] = i.find('span', class_='comment-time').text.strip()
        data.append(dic)
    return data


def get_alldata(n):
    """Scrape ``n`` comment pages and return the cleaned table.

    Args:
        n: Number of pages to scrape.

    Returns:
        A ``pandas.DataFrame`` with the columns in ``COLUMNS``; the star
        rating is converted to float and the vote count to int.
    """
    alldata = []
    for u in get_urls(n):
        alldata.extend(get_informations(u))

    # Build the frame first: the original cleaned ``df`` before it existed.
    df = pd.DataFrame(alldata, columns=COLUMNS)
    # Assign whole columns rather than ``df.loc[:, col] = ...``: setting
    # through .loc writes into the existing object column and may leave the
    # dtype unchanged.
    df['所评星级'] = df['所评星级'].astype('float')
    df['评论获赞数'] = df['评论获赞数'].astype('int')
    return df


if __name__ == '__main__':
    # Part 1: scrape the comments and save them to a CSV file.
    df = get_alldata(5)
    df.to_csv('《少年的你》影评.csv', index=False, encoding='utf-8')
    print(df)

    # Part 2: histogram of the star ratings; ``bins`` is the number of
    # equal-width intervals. The column is named '所评星级'.
    df['所评星级'].hist(bins=20)
    # Needed to open the figure window when run as a plain script.
    plt.show()
# str.strip() returns a copy of the string with leading and trailing
# whitespace (including newlines) removed.
s1 = '\nabcdefg\n'
print(s1.strip())
字符串:DataFrame 数据框里的 name 列为字符串形式。

清除字符串左侧的空白:
newname = df['name'].str.lstrip()

清除字符串右侧的空白:
newname = df['name'].str.rstrip()

清除字符串两侧的空白:
newname = df['name'].str.strip()
https://www.cnblogs.com/rexyan/p/7975707.html
https://www.jianshu.com/p/7332afd70797
# next_sibling example. Given consecutive sibling tags such as:
#
#     <p class='comment-time'>ads</p>
#     <p class='comment-time'>bdg</p>
#     <p class='comment-time'>fgh</p>
#
# find() returns the first matching <p>, and the following siblings can be
# reached through next_sibling. Two hops are used here, presumably because
# the first hop lands on the whitespace text node between the tags -
# confirm against the actual markup.
# ``i`` is the parent element being parsed (defined by the surrounding code).
Tsecond = i.find('p', class_='comment-time').next_sibling.next_sibling
TsecondText = i.find('p', class_='comment-time').next_sibling.next_sibling.text
import matplotlib as mpl
import matplotlib.pyplot as plt

# Make KaiTi the preferred font for both the sans-serif and serif families.
# NOTE(review): presumably done so that Chinese labels render in plots; the
# font must be installed on the machine - confirm.
mpl.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams['font.serif'] = ['KaiTi']
标签:art next ast 转换 details 时间 code ams col
原文地址:https://www.cnblogs.com/Aiahtwo/p/12339872.html