码迷,mamicode.com
首页 > 其他好文 > 详细

一种通用数据采集的schema定义形式

时间:2015-02-09 17:56:10      阅读:124      评论:0      收藏:0      [点我收藏+]

标签:

{
  "name": "凤凰金融",
  "notice": {
    "data": "attribute",
    "matcher": [
      {
        "match": "xpath",
        "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
      }
    ],
    "comments": "网站通告"
  },
  "url": {
    "data": "attribute",
    "value": "http://www.fengjr.com/financing/list?type=cx",
    "comments": "本平台数据的采集URL"
  },
  "project": {
    "data": "url",
    "url": {
      "data": "attribute",
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ],
      "template": ""
    },
    "title": {
      "data": "attribute",
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ]
    },
    "detail": {
      "title": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "amount": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      }
    }
  },
  "member": {
    "data": "sub_item",
    "sub_item": {
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ],
      "src-save": 0,
      "url": {
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ],
        "template": ""
      }
    },
    "detail": {
      "title": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "amount": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      }
    }
  },
  "src-save": 1
}

 

补充:

{
  "name": "凤凰金融",
  "notice": {
    "data": "attribute",
    "matcher": [
      {
        "match": "xpath",
        "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
      }
    ]
  },
  "url": {
    "data": "attribute",
    "value": "http://www.fengjr.com/financing/list?type=cx"
  },
  "project": {
    "data": "url",
    "url": {
      "data": "attribute",
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ],
      "template": ""
    },
    "title": {
      "data": "attribute",
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ]
    },
    "detail": {
      "name": "网贷列表",
      "title": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "amount": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      }
    }
  },
  "member": {
    "data": "sub_item",
    "sub_item": {
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ],
      "src-save": 0,
      "url": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ],
        "template": ""
      }
    },
    "detail": {
      "name": "会员材料",
      "title": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "amount": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      }
    }
  },
  "src-save": 1,

  "crawler": {

      "handler":"httpClient|selenium",
      "results":"html|json|text",
      "next_page": {
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
         ],
        "template": ""
      },
      "history": "re-crawl|skip|stop"
    }

}

 

一种通用数据采集的schema定义形式

标签:

原文地址:http://www.cnblogs.com/feika/p/4281864.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!