# Crawl download links for every movie on dytt8.net (Movie Heaven)
# Features:
# 1. Fetch each movie's resource download link and print it
# 2. Save all links to a single CSV file
import time
import requests
from bs4 import BeautifulSoup
import csv
def spider(url):
    global page, No, fobj
    try:
        page += 1
        print("Page {}".format(page))
        # time.sleep(1)
        # Fetch the page; the site is GBK-encoded, so set the encoding
        # before reading the response body as text
        html = requests.get(url)
        html.encoding = "gbk"
        html = html.text
        # Load the document into BeautifulSoup
        root = BeautifulSoup(html, "lxml")
        # Find the elements we need: one <table> per movie in the list
        tables = root.find("div", attrs={"class": "co_content8"}).find("ul").find_all("table")
        writer = csv.writer(fobj)
        for table in tables:
            name = table.find("a").text
            url = "http://www.dytt8.net" + table.find("a")["href"]
            # Write one row per movie to the CSV file
            writer.writerow([name, url])
            No += 1
            print("No:", No, name, url)
        # Crawl the next page
        # time.sleep(1)
        urls = root.find("div", attrs={"class": "co_content8"}).find("div", attrs={"class": "x"}).find_all("a")
        # Look for the link whose text is "下一页" ("next page")
        for u in urls:
            if u.text == "下一页":
                url = "https://www.dytt8.net/html/gndy/dyzz/" + u["href"]
                print(url)
                # Recurse into the next page; this goes one stack frame
                # deeper per page, so very long listings can hit Python's
                # default recursion limit
                spider(url)
    except Exception:  # no next page, or the page failed to parse
        print("finished")
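# The selectors above assume list pages shaped roughly like this (an
# illustrative sketch, not markup verified against the live site):
#
#   <div class="co_content8">
#     <ul>
#       <table> ... <a href="/html/gndy/dyzz/12345.html">Movie Title</a> ... </table>
#       ...
#     </ul>
#     <div class="x"> ... <a href="list_23_2.html">下一页</a> ... </div>
#   </div>
#
# If the site changes this layout, the find(...) chains raise AttributeError
# and spider() prints "finished" early.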
begin_time = time.time()
url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
page = 0
No = 0
# newline="" stops csv.writer from emitting blank rows on Windows
fobj = open("movies.csv", "wt", encoding="gbk", newline="")
spider(url)
fobj.close()
end_time = time.time()
elapsed = end_time - begin_time  # avoid naming this "time", which would shadow the module
m, s = divmod(round(elapsed), 60)
print("Elapsed: {}min{}s".format(m, s))
Original post: https://www.cnblogs.com/billie52707/p/12113520.html