
An HC360.com (慧聪网) Web Crawler



import requests
from bs4 import BeautifulSoup
import pandas as pd
import gevent
from gevent import monkey
monkey.patch_all()  # patch blocking I/O so it cooperates with gevent greenlets
                    # (only needed if the crawl is run concurrently; see the sketch at the end)
import time
import re
import random

# Pool of desktop User-Agent strings to rotate through.
UA_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; GWX:MANAGED)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; GWX:MANAGED)',
]

# requests expects the proxy mapping to be keyed by URL scheme ('http'/'https'),
# and proxy URLs use forward slashes.
proxies_list = [{'http': 'http://10.220.70.254:808'},
                {'http': 'http://10.221.70.254:808'},
                {'http': 'http://10.222.70.254:808'},
                {'http': 'http://10.223.70.254:808'}]

headers = {'User-Agent': random.choice(UA_list), 'Referer': 'http://b2b.hc360.com/'}
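
Note that `headers` is built once at import time, so every request in a run reuses the same User-Agent, which defeats the rotation that `UA_list` suggests. A minimal sketch of a per-request alternative (the helper name `fresh_headers` is mine, not from the original post):

def fresh_headers():
    # Hypothetical helper: build a new header dict per request so the
    # User-Agent actually rotates across UA_list.
    return {'User-Agent': random.choice(UA_list),
            'Referer': 'http://b2b.hc360.com/'}

The calls below could then pass headers=fresh_headers() instead of the module-level dict.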

def diyu(sheng, shi):
    # diyu (地域) = region; sheng (省) = province, shi (市) = city.
    # Walk up to 100 search-result pages for one province/city pair and
    # yield the company detail-page URLs found on each page.
    # (The z= parameter is the GBK-encoded location filter 中国:省:市.)
    for i in range(100):
        or_url = 'http://s.hc360.com/?w={}&mc=enterprise&ee={}&z=%D6%D0%B9%FA%3A{}%CA%A1%3A{}'.format(sheng, i + 1, sheng, shi)
        res = requests.get(or_url, headers=headers)
        soup = BeautifulSoup(res.text, 'lxml')
        urls = soup.select('dd.til > h3 > a')
        for url in urls:
            # yield instead of return, so more than the first link is produced
            yield url.get('href')
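
The `__main__` block below reads company URLs from `uu.txt`; presumably `diyu` was run beforehand to produce that file. A hedged usage sketch (the Beijing values are placeholders, not from the original post):

# Hypothetical usage: dump the crawled links to uu.txt for the main loop.
# '北京' as both province and city is an assumption, not from the original.
with open('uu.txt', 'w') as out:
    for href in diyu('北京', '北京'):
        out.write(href + '\n')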

def url_parser(urld):
    # Fetch a company page and dispatch to the parser matching its template.
    res = requests.get(urld, headers=headers, proxies=random.choice(proxies_list), timeout=60)
    if res.status_code != 404:
        soup = BeautifulSoup(res.text, 'lxml')
        # '公司黄页' ("company yellow pages") marks the yellow-pages template
        flag = re.findall(r'公司黄页', str(soup))
        if len(flag) > 0:
            return url_HYparer(soup)
        else:
            or_url = urld + 'shop/company.html'
            res = requests.get(or_url, headers=headers, proxies=random.choice(proxies_list), timeout=60)
            soup1 = BeautifulSoup(res.text, 'lxml')
            flag1 = re.findall(r'手机极速版', str(soup1))  # mobile "speed version" template
            flag2 = re.findall(r'未认证', str(soup1))      # unverified-shop template
            if len(flag1) > 0:
                return url_SJJSparer(soup1)
            elif len(flag2) > 0:
                return url_uncertifie(soup1)
            else:
                return url_NSJJSparer(soup1)
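
`url_parser` makes up to two proxied requests with no retry, so a single flaky proxy loses the whole record (the `except` in `__main__` only logs it). A minimal retry sketch under that assumption; `fetch_with_retry` is my name, not the post's:

def fetch_with_retry(url, attempts=3):
    # Hypothetical helper (not in the original): retry a flaky proxy fetch
    # with a fresh random proxy each attempt before giving up.
    for n in range(attempts):
        try:
            return requests.get(url, headers=headers,
                                proxies=random.choice(proxies_list), timeout=60)
        except requests.RequestException:
            if n == attempts - 1:
                raise
            time.sleep(2 ** n)  # simple exponential backoff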

def url_NSJJSparer(soup):
    # Parser for the standard shop template (neither mobile nor unverified).
    data = {
        'company_name': soup.select('td.contitlebg > span')[0].text.strip(),
        'name': soup.select('span.bluezzbold.font14')[0].text.strip(),
        'address': soup.select('td.conbg.conbg2 > ul:nth-of-type(1) > li:nth-of-type(2)')[0].get('title'),
        # 11-digit mobile number, or 4-digit area code + 8-digit landline
        'phone': re.search(r'\d{11}|\d{4}-\d{8}', str(soup)).group()}
    return data

def url_HYparer(soup):
    # Parser for the yellow-pages (黄页, HY) template.
    data = {
        'company_name': soup.select('div.sub-info > h1')[0].text,
        'name': soup.select('samp')[0].text,
        'address': soup.select('div.tableCon > div:nth-of-type(2) > ul > li:nth-of-type(3) > span.conRight')[0].text,
        'phone': soup.select('div.tableCon > div:nth-of-type(2) > ul > li:nth-of-type(2) > span.conRight')[0].text
    }
    return data

def url_SJJSparer(soup):
    # Parser for the mobile "speed version" (手机极速, SJJS) template.
    data = {
        'company_name': soup.select('div.ContacCon1 > h3')[0].text.strip(),
        'name': soup.select('div.ContactsName > span > a')[0].text.strip(),
        'address': soup.select('div.ContacCon3 > ul > li:nth-of-type(1) > div.con3Rig')[0].text.strip(),
        'phone': re.search(r'\d{11}|\d{4}-\d{8}', str(soup)).group()}
    return data

def url_uncertifie(soup):
    # Parser for the unverified-shop (未认证) template.
    data = {
        'company_name': soup.select('td.contitlebg_1 > span')[0].text.strip(),
        'name': soup.select('span.bluezzbold.font14')[0].text.strip(),
        'address': soup.select('td.conbg.conbg2 > ul:nth-of-type(1) > li:nth-of-type(2)')[0].text.strip(),
        'phone': re.search(r'\d{11}|\d{4}-\d{8}', str(soup)).group()}
    return data
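
All four template parsers index `[0]` into `soup.select(...)` results and call `.group()` on `re.search(...)`, either of which raises as soon as one selector or pattern misses, discarding the whole record. A tolerant lookup could degrade per field instead; a sketch (the helper name is mine):

def first_text(soup, selector, default=''):
    # Hypothetical helper: return the first match's stripped text, or a
    # default instead of raising IndexError when the selector misses.
    hits = soup.select(selector)
    return hits[0].text.strip() if hits else default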

if __name__ == '__main__':
    # uu.txt holds one company URL per line (see the diyu sketch above).
    with open('uu.txt', 'r') as f:
        info_total = []
        for i in f:
            try:
                info_ary = url_parser(i.strip())
                time.sleep(random.randint(1, 5))  # pause 1-5 s between requests
                info_total.append(info_ary)
                print(len(info_total))
            except Exception as e:
                print(e, i.strip())  # log the failure and move on
        df = pd.DataFrame(info_total)
        df.to_excel('huicong_beijing.xlsx')
        print('Done')
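
gevent is imported and monkey-patched at the top but never actually used: the main loop fetches one URL at a time. A hedged sketch of what a concurrent variant might look like (the pool size and function name are my assumptions, not the post's):

# Hypothetical gevent variant (not in the original): fetch in parallel
# with a small pool so the site isn't hammered.
from gevent.pool import Pool

def crawl_concurrently(url_list, size=5):
    pool = Pool(size)
    results = pool.map(url_parser, url_list)
    return [r for r in results if r]  # drop None results from 404 pages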

Original post: http://www.cnblogs.com/Erick-L/p/6945009.html
