码迷,mamicode.com
首页 > 编程语言 > 详细

爬取前程无忧python职位信息

时间:2018-07-26 19:52:43      阅读:223      评论:0      收藏:0      [点我收藏+]

标签:join   ide   path   max   find   response   dict   字典   x11   

1.re实现

技术分享图片
 1 import re,os
 2 import requests
 3 from requests.exceptions import RequestException
 4 
 5 MAX_PAGE = 10 #最大页数
 6 KEYWORD = python
 7 headers = {
 8     User-Agent:
 9         Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36
10 }
11 file_name = re_job51_python.txt
12 
def getHtml(page):
    """Fetch the source of one 51job search-result page.

    Args:
        page: Page number substituted into the listing URL.

    Returns:
        The response body as text, or None when the request raises a
        RequestException.
    """
    url = "https://search.51job.com/list/040000,000000,0000,00,9,99,{0},2,{1}.html?".format(KEYWORD, page)
    try:
        # Without a timeout a stalled connection would block the crawl forever;
        # a timeout surfaces as a RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print("请求出错")
        return None
    # Decode with the encoding detected from the body rather than the one
    # declared in the response headers.
    response.encoding = response.apparent_encoding
    return response.text
23 
# Compiled once at import time. re.S lets "." match newlines so each ".*?"
# can span the line breaks between tags.
_JOB_ROW_RE = re.compile(
    r'class="t1 ">.*? <a target="_blank" '
    r'title="(.*?)".*? <span class="t2"><a target="_blank" '
    r'title="(.*?)".*?<span '
    r'class="t3">(.*?)</span>.*?<span '
    r'class="t4">(.*?)</span>.*? <span '
    r'class="t5">(.*?)</span>',
    re.S)


def getTarget(html):
    """Parse a search-result page and return the target fields of every row.

    Args:
        html: Page source, or None when the fetch failed.

    Returns:
        A list of 5-tuples of strings captured, in order, from the ``title``
        attribute inside the ``t1`` element, the ``title`` attribute inside
        the ``t2`` span, and the text of the ``t3``, ``t4`` and ``t5`` spans
        (presumably job title, company, location, salary and publish date;
        verify against the live markup). The list is empty when ``html`` is
        None or empty, or when nothing matches.
    """
    if not html:
        # A failed fetch yields None; report "no rows" instead of letting
        # the regex engine raise TypeError.
        return []
    return _JOB_ROW_RE.findall(html)
36 
37 
def save_to_txt(item, path=None):
    """Append one record to the output text file as a comma-separated line.

    Args:
        item: Sequence of strings forming one record.
        path: File to append to. Defaults to the module-level ``file_name``.
    """
    if path is None:
        path = file_name
    # newline="" turns off newline translation so "\n" is written verbatim.
    # The explicit UTF-8 encoding keeps non-ASCII field text independent of
    # the platform's default encoding.
    with open(path, "a", newline="", encoding="utf-8") as f:
        if item:
            # Fields are separated by "," and the record ends with a newline.
            f.write(",".join(item) + "\n")
48 
def main():
    """Crawl the result pages and write every extracted row to ``file_name``."""
    # save_to_txt appends, so remove any file left over from a previous run.
    if os.path.exists(file_name):
        os.remove(file_name)

    # Crawl pages 1..MAX_PAGE inclusive; starting the range at 0 would request
    # one page more than MAX_PAGE allows.
    for page in range(1, MAX_PAGE + 1):
        html = getHtml(page)
        if html is None:
            # The fetch failed and was already reported; skip this page
            # instead of handing None to the parser.
            continue
        for item in getTarget(html):
            save_to_txt(item)


if __name__ == "__main__":
    main()
View Code

 2.xpath实现

技术分享图片
  1 import os
  2 import requests
  3 from requests.exceptions import RequestException
  4 from lxml import etree
  5 import pymongo
  6 from spiders.前程无忧.mongo_config import *
  7 
  8 # mongo数据库设置
  9 client = pymongo.MongoClient(MONGO_URL)
 10 db = client[MONGO_DB]
 11 
 12 MAX_PAGE = 5
 13 KEYWORD = python
 14 headers = {
 15     User-Agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)  16     Chrome/63.0.3239.132 Safari/537.36
 17 }
 18 file_name = xpath_job51_python.txt
 19 
 20 # 获取网页
 21 def get_html(page):
 22     try:
 23         url = https://search.51job.com/list/040000,000000,0000,00,9,99,{},2,{}.html?.format(KEYWORD,page)
 24         response = requests.get(url,headers=headers)
 25         response.encoding = response.apparent_encoding
 26         return response.text
 27     except RequestException:
 28         return None
 29 
 30 # 解析网页
 31 def parse_html(html):
 32     # 构造xpath解析对象,可自动修整HTML文本
 33     html = etree.HTML(html)
 34     # 获取文本 /text()
 35     # 获取属性 /@href
 36     # 获取第i个标签 /tar_name[i]  从1开始
 37     # normalize-space-->去空格换行符
 38     # position_name = html.xpath(‘normalize-space(//div[@class="el"]/p/span/a/text())‘)
 39 
 40     # 职位名称,
 41     position_names = []
 42     for name in html.xpath(//div[@class="el"]/p/span/a/text()):
 43         position_name = name.strip()
 44         position_names.append(position_name)
 45 
 46     # 职位地址
 47     position_urls = html.xpath(//div[@class="el"]/p/span/a/@href)
 48 
 49     # 公司名称
 50     company_names = html.xpath(//div[@class="el"]/span[1]/a/text())
 51 
 52     # 公司地址
 53     company_urls = html.xpath(//div[@class="el"]/span[1]/a/@href)
 54 
 55     # 位置
 56     locations = html.xpath(//div[@class="el"]/span[@class="t3"]/text())
 57 
 58     # 薪资
 59     salarys = html.xpath(//div[@class="el"]/span[@class="t4"]/text())
 60 
 61     # 发布时间
 62     release_dates = html.xpath(//div[@class="el"]/span[4]/text())
 63 
 64     result = zip(position_names,position_urls,company_names,company_urls,locations,salarys,release_dates)
 65     return result
 66 
 67 
 68 def save_to_txt(element):
 69     with open(file_name,a,newline=‘‘) as f:
 70         for i in range(len(element)):
 71             # data = ‘,‘.join(element[i])
 72             if i == len(element)-1:
 73                 f.write(element[i])
 74                 f.write(\n)
 75             else:
 76                 f.write(element[i]+,)
 77 
 78 
 79 def save_to_mongo(element):
 80     keys = [position_name,position_url,company_name,
 81             company_url,location,salary,release_date]
 82     result = dict(zip(keys,list(element)))
 83     if db[MONGO_TABLE_XPATH].insert(result):
 84         print(数据成功存储到mongo数据库中)
 85         return True
 86     return False
 87 
 88     # 遍历字典元素
 89     # for k,v in result.items():
 90     #     print(k,‘:‘,v)
 91     for key in result:
 92         print(key,:,result[key])
 93 
 94 
 95 
 96 def main():
 97     if os.path.exists(file_name):
 98         os.remove(file_name)
 99     for page in range(1,MAX_PAGE+1):
100         html = get_html(page)
101         elements = parse_html(html)
102         if elements:
103             for element in elements:
104                 save_to_txt(element)
105                 save_to_mongo(element)
106 
107 if __name__ == __main__:
108     main()
View Code

 



爬取前程无忧python职位信息

标签:join   ide   path   max   find   response   dict   字典   x11   

原文地址:https://www.cnblogs.com/ray-mmss/p/9373742.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!