爬取7160

时间：2018-01-28 11:30:35 阅读：242 评论：0 收藏：0 [点我收藏+]

标签：pil string import .com rand raw lib title sam

优化后的代码如下：

先用循环创建20个目录，然后把下载的图片循环写入这20个目录，每个目录最多写入50000个文件。

#coding=utf-8
import os
import random
import sys


import urllib.request
from bs4 import BeautifulSoup
from urllib import error
import re
# Category slugs. The extracted source used mis-encoded curly quotes here
# (a syntax error) and listed 'lianglichemo' twice; both are fixed.
# NOTE(review): `ls` is not referenced by the code visible in this file.
ls = ['zhenrenxiu', 'meinv', 'lianglichemo', 'rentiyishu', 'xiaohua']
# Snapshot, taken once at start-up, of the entries under the download root.
# get_file_name() picks its target directory from this list.
file_list = os.listdir("d:\\craw\\")

def validateTitle(title):
    """Return *title* with every reserved punctuation character replaced by "_"."""
    # The character class matches any one of:  / \ : * ? " < > |
    reserved_chars = re.compile(r"[\/\\\:\*\?\"\<\>\|]")
    return reserved_chars.sub("_", title)

def get_file_name(base_dir='d://craw/', max_files=50000, candidates=None):
    """Pick a random target directory that still has room and return its path.

    Args:
        base_dir: Root directory that holds the target sub-directories.
        max_files: A sub-directory holding this many entries or more is
            considered full and is never returned.
        candidates: Names of the sub-directories to choose from. Defaults to
            the module-level ``file_list`` snapshot.

    Returns:
        The chosen directory path with a trailing "/", ready to be
        concatenated with a file name.

    Raises:
        RuntimeError: If none of the candidates can accept another file.
    """
    names = list(file_list if candidates is None else candidates)
    # Shuffling and taking the first usable entry keeps the choice random
    # while guaranteeing termination; the original recursed without bound
    # when every directory was full.
    random.shuffle(names)
    for name in names:
        path = os.path.join(base_dir, str(name))
        if not os.path.exists(path):
            # The snapshot may list a directory that has since disappeared.
            os.mkdir(path)
            print("创建目录" + str(path))
            return str(path) + '/'
        if not os.path.isdir(path):
            # A regular file cannot hold downloads; the original called
            # os.mkdir on it, which raises FileExistsError.
            continue
        # Count entries in the candidate itself. The original listed
        # 'd://<name>', i.e. outside the download root.
        if len(os.listdir(path)) < max_files:
            return str(path) + '/'
    raise RuntimeError("no target directory with free space under " + str(base_dir))

# Crawl galleries http://www.7160.com/meinv/<j>: read the page count from the
# gallery's landing page, then fetch each page and save one image per page.

# Compiled once instead of on every gallery iteration.
_PAGE_COUNT_DIGITS = re.compile(r'\d+')

for j in range(1, 100000):
    url_origin = "http://www.7160.com/meinv/" + str(j)
    try:
        # `with` closes the HTTP response; BeautifulSoup reads it fully in
        # its constructor, so the soup stays usable afterwards.
        with urllib.request.urlopen(url_origin) as page_obj:
            page_soup = BeautifulSoup(page_obj, 'lxml')

        # The text node containing '共' carries the total page count.
        # A missing node or a node without digits means zero pages.
        total_page_text = page_soup.find(text=re.compile('共'))
        match = None
        if total_page_text is not None:
            match = _PAGE_COUNT_DIGITS.search(total_page_text)
        total_page = int(match.group()) if match is not None else 0

        for i in range(1, total_page + 1):
            if i == 1:
                url = url_origin + "/index.html"
            else:
                url = url_origin + "/index_" + str(i) + ".html"
            try:
                with urllib.request.urlopen(urllib.request.Request(url)) as res:
                    soup = BeautifulSoup(res, 'lxml')

                title_obj = soup.find(attrs={"class": "picmainer"})
                if title_obj is None:
                    continue

                print(url)
                title = title_obj.h1.string
                # NOTE(review): this takes the first <img> of the whole
                # document, not one inside "picmainer" - confirm it is the
                # gallery image.
                content = soup.find('img')
                src = content.get("src")
                file_name = validateTitle(title) + ".jpg"
                # Resolve the target once: get_file_name() is random, so a
                # second call for the log line could name another directory.
                target = str(get_file_name()) + file_name
                urllib.request.urlretrieve(src, target)
                print(target + "保存成功")
            except Exception as e:
                # Best effort: one failing page must not stop the crawl.
                print("异常" + str(e))
    except Exception as e:
        # Best effort: one failing gallery must not stop the crawl.
        print("异常" + str(e))

爬取7160

标签：pil string import .com rand raw lib title sam

原文地址：https://www.cnblogs.com/php-linux/p/8370574.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行