win4000-spider

时间：2018-10-26 13:14:22 阅读：676 评论：0 收藏：0 [点我收藏+]

标签：gecko like 请求 __name__ main with == findall spi

import requests, re
from requests.exceptions import RequestException

# Image URLs collected by get_html() and later consumed by download().
lis = []
# Request headers sent with every HTTP call; a desktop Chrome User-Agent.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
                      ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
def win4000_next_page(url):
    """Fetch a page and return its body as text.

    Args:
        url: Address of the page to request.

    Returns:
        The response text when the server answers with HTTP 200,
        otherwise None (a message is printed describing the failure).
    """
    try:
        # A timeout keeps a stalled connection from hanging the crawl forever;
        # requests raises a RequestException subclass when it expires.
        resp = requests.get(url, headers=head, timeout=10)
    except RequestException:
        print('请求出错')
        return None
    if resp.status_code == 200:
        return resp.text
    print('网站出错')
    return None

def get_html():
    """Walk the gallery page by page and collect image URLs into ``lis``.

    Starts from a fixed gallery page, extracts the picture URL from each
    page, then follows the "下一张" (next picture) link. At most 50 pages
    are visited. The walk stops early when a page cannot be fetched or no
    next-page link is found.
    """
    url = 'http://www.win4000.com/meinv163726.html'
    # Capture groups extract the URLs directly. str.lstrip/rstrip remove
    # *character sets*, not prefixes/suffixes, so they are not safe here.
    image_pattern = re.compile(r'url="(http://pic1\.win4000\.com/pic[^"]+)"')
    next_pattern = re.compile(r'href="([^"]+)"[^>]*>下一张')

    html = win4000_next_page(url)
    for count in range(1, 51):  # scrape at most 50 pictures
        if not html:
            # The fetch failed (win4000_next_page returned None) or the page is empty.
            break
        image = image_pattern.search(html)
        if image is not None:
            lis.append(image.group(1))
        next_link = next_pattern.search(html)
        if next_link is None:
            # No "next picture" link on this page: nothing more to follow.
            break
        html = win4000_next_page(next_link.group(1))
        print('下一张%s' % count)



def download(lis, save_dir=r'C:\pythondm\spider\piclib'):
    """Download every image URL in ``lis`` and save it as ``<n>.jpg``.

    Args:
        lis: Iterable of image URLs.
        save_dir: Directory the files are written to. Defaults to the
            original hard-coded absolute path; the directory must already
            exist, otherwise ``open`` raises.

    Files are numbered by the 1-based position of the URL in ``lis``.
    URLs whose download fails are skipped, so no empty file is created.
    """
    import os  # local import: only needed to build the output path

    for count, pic_url in enumerate(lis, 1):
        data = grab_pic(pic_url)
        if data is None:
            # grab_pic already printed the reason; writing None would raise TypeError.
            continue
        with open(os.path.join(save_dir, '%s.jpg' % count), 'wb') as f:
            f.write(data)
        print('写入完成%s' % count)

def grab_pic(url):
    """Download a single image and return its raw bytes.

    Args:
        url: Address of the image to request.

    Returns:
        The response body as bytes when the server answers with HTTP 200,
        otherwise None (a message is printed describing the failure).
    """
    try:
        # A timeout keeps one stalled download from blocking all the others;
        # requests raises a RequestException subclass when it expires.
        response = requests.get(url, headers=head, timeout=10)
    except RequestException:
        print('请求出错')
        return None
    if response.status_code == 200:
        return response.content
    print('网站出错')
    return None

if __name__ == '__main__':
    # Script entry point: collect the image URLs first, then download them all.
    get_html()
    download(lis)
    print('全部完成')

win4000-spider

标签：gecko like 请求 __name__ main with == findall spi

原文地址：https://www.cnblogs.com/sw-z/p/9855493.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行