The May Day holiday is almost here, and I'm genuinely happy about it, certainly happier than I am about writing blog posts, haha.
I hesitated over whether to write this scraping post at all, since there are already plenty of scraping resources online, and scraping isn't really a research subject in itself but a means of getting hold of data.
Why I wrote this:
Package installation notes:
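As a rough summary inferred from the imports used throughout this post (my own list, not the original's), the examples below need the following packages installed first:

pip install requests beautifulsoup4 selenium pandas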
Using Requests and regular expressions
Regular expressions: important things deserve saying twice.
An online site for practicing regular expressions: https://regexone.com/
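As a warm-up, here is a tiny self-contained sketch (mine, not the original post's) of the two re calls everything below leans on, re.compile and findall; the sample string is invented for illustration:

import re

# findall returns every non-overlapping match of the pattern, left to right
pattern = re.compile(r"https://movie.douban.com/review/\d+/")  # \d+ = one or more digits
sample = "see https://movie.douban.com/review/10086/ and https://movie.douban.com/review/10010/"
print(pattern.findall(sample))
# ['https://movie.douban.com/review/10086/', 'https://movie.douban.com/review/10010/']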
Now, without further ado, the real code, whew~
import re
import requests
import pandas as pd

url = "https://movie.douban.com/review/best/"
total1 = requests.get(url).text
# match links to individual review pages, e.g. https://movie.douban.com/review/12345/
pattern1 = re.compile(r"https://movie.douban.com/review/\d+/")
review_url_list = pattern1.findall(total1)
review_url_list = list(set(review_url_list))  # remove duplicates

review = []
pattern2 = re.compile("<p>.*</p>")  # grab the <p>...</p> paragraphs of each review
for url in review_url_list:
    total = requests.get(url).text
    review0 = str(pattern2.findall(total))
    review.append(review0)

DF1 = pd.DataFrame({"影评": review, "网址": review_url_list})
DF1  # note that the reviews still contain plenty of HTML markup that needs removing
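One rough way to strip that leftover markup (my own addition; a real project might prefer BeautifulSoup's get_text()) is a re.sub pass over the column:

# a rough cleanup sketch: delete anything that looks like an HTML tag
tag_pattern = re.compile(r"<[^>]+>")
DF1["影评"] = DF1["影评"].apply(lambda text: tag_pattern.sub("", text))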
import re
import requests

url = "https://movie.douban.com/"
total = requests.get(url).text
# links to the films currently showing
pattern1 = re.compile(r"https://movie.douban.com/subject/\d+/\?from=showing")
# links to the poster images
pattern2 = re.compile(r"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p\d+\.jpg")
pattern2.findall(total)[:5]
Once we have the image URLs, it means we can fetch the images themselves.
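For example, here is a minimal sketch of actually saving those posters; the "posters" directory and the index-based file names are my own choices for illustration:

import os

os.makedirs("posters", exist_ok=True)
for i, img_url in enumerate(pattern2.findall(total)[:5]):
    resp = requests.get(img_url)           # fetch the raw image bytes
    with open(os.path.join("posters", "%d.jpg" % i), "wb") as f:
        f.write(resp.content)              # write them straight to disk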
Using BeautifulSoup
import string
import urllib.request
from bs4 import BeautifulSoup

# strip punctuation (?, /, ',', '.', ...) so it cannot end up in the file names
table = str.maketrans({key: None for key in string.punctuation})

def getAllImageLink():
    for i in range(0, 2):  # raise this to 100 pages or more if you like
        if i == 0:
            url = "http://www.dbmeinv.com"
        else:
            # work out for yourself how the URL changes when you turn the page
            url = "https://www.dbmeinv.com/?pager_offset=" + str(i + 1)
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, "html.parser")
        liResult = soup.find_all("li", attrs={"class": "span3"})
        for li in liResult:
            imageEntityArray = li.find_all("img")
            for image in imageEntityArray:
                link = image.get("src")
                imageName = image.get("title")
                imageName = imageName.translate(table)  # clean the title for use as a file name
                filesavepath = "%s.png" % imageName
                urllib.request.urlretrieve(link, filesavepath)  # download and save the image

getAllImageLink()
Now for one of the big bosses:
import re
import collections
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36'
}

def craw(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

# Baidu Baike entry for the Beijing subway; the fifth table links to each line's page
soup = craw('https://baike.baidu.com/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%81/408485')
lines = soup.find_all('table')[4].find_all('a')[:-1]

stations_connection = collections.defaultdict(list)  # adjacency list: station -> neighbouring stations
dist = collections.defaultdict(int)                  # distance in metres between adjacent stations
node_in_line = collections.defaultdict(set)          # which lines each station belongs to
pattern = re.compile(r'([\w|\d]+)相邻站间距信息统计表')  # caption of each line's distance table

for line in lines[:-3]:
    link = 'https://baike.baidu.com' + line.get('href')
    soup = craw(link)
    for caption in soup.find_all('caption'):
        line_name = re.findall(pattern, caption.get_text())
        if line_name:
            print('\n----The information of {} is following...'.format(line_name[0]))
            table = caption.find_parent('table')
            for neighbor in table.find_all('tr')[1:]:
                start, end = re.findall(re.compile(r'([\w|\d]+)——([\w|\d]+)'), neighbor.th.text)[0]
                distance = re.findall(re.compile(r'([\d]+)米*'), neighbor.td.text)[0]
                stations_connection[start].append(end)
                stations_connection[end].append(start)
                dist[(start, end)] = dist[(end, start)] = int(distance)
                node_in_line[start].add(line_name[0])
                node_in_line[end].add(line_name[0])
                print('{}--{}: {}m'.format(start, end, distance))
    break  # demo: only the first line's page is crawled; remove this to collect them all
With this data, you can draw a Beijing subway route map (see Figure 1).
It also means you can build a simple Beijing subway transfer app, as sketched in Figure 2.
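As a hedged sketch of the transfer-app idea (my own addition, not the app behind Figure 2): a breadth-first search over the stations_connection adjacency list built above finds the route with the fewest stops between two stations.

from collections import deque

def find_route(start, goal, connections):
    """Breadth-first search: the first path to reach goal has the fewest stops."""
    queue = deque([[start]])
    visited = {start}
    while queue:
        path = queue.popleft()
        station = path[-1]
        if station == goal:
            return path
        for nxt in connections[station]:
            if nxt not in visited:
                visited.add(nxt)
                queue.append(path + [nxt])
    return None  # no connection found

# e.g. find_route('西直门', '国贸', stations_connection)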
And since all of this can be done by hand in a browser, that means it can be automated:
For an introductory Selenium tutorial, see Christopher's Github.
Example: scraping consumer reviews from the Lancôme flagship store on Tmall
import pandas as pd
from selenium import webdriver

# swap in the review page of your own target product (here: the Lancôme Tmall flagship store)
url = 'https://detail.tmall.com/item.htm?spm=a1z10.3-b-s.w4011-14640892229.94.3c6f3c22ejqhZA&id=556028888955&rn=f74615e5cda8e547b07f67e1eb384119&abbucket=16&on_comment=1'

driver1 = webdriver.Chrome()  # after Chrome opens automatically, log in by hand
driver1.get(url)  # open the page

times = [0] * 100   # renamed from "time" so it does not shadow the time module
review = [0] * 100
# The XPath of the "next page" button differs on the first four pages, so those pages
# are crawled "by hand": scrape one page, click "next page" manually, then set j to
# 2, 3, 4, ... Sites this irregular are hard to scrape in bulk; for large volumes,
# prefer sites whose page URLs follow a pattern you can loop over.
j = 1
for i in range(1, 21):
    success = False
    while not success:
        try:
            times[(j - 1) * 20 + i - 1] = driver1.find_element_by_xpath(
                '//*[@id="J_Reviews"]/div/div[6]/table/tbody/tr[' + str(i) + ']/td[1]/div[2]').text
            review[(j - 1) * 20 + i - 1] = driver1.find_element_by_xpath(
                '//*[@id="J_Reviews"]/div/div[6]/table/tbody/tr[' + str(i) + ']/td[1]/div[1]/div[1]').text
            success = True
        except Exception:
            success = True  # if the element is missing, give up on this entry and move on

DF1 = pd.DataFrame({"time": times, "reviews": review})
DF1.head()  # appended reviews make the formats inconsistent, so later data cleaning matters
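A quick, hedged sketch of that cleaning step (my own addition, and the details depend on what actually came back): drop the placeholder rows that were never filled in and normalise the whitespace inside each review.

DF1 = DF1[DF1["reviews"] != 0]  # rows still holding the 0 placeholder were never scraped
DF1["reviews"] = DF1["reviews"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()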
Once you have the consumer review data, you can do...
Original post: https://www.cnblogs.com/ChristopherLE/p/10798579.html