
Common XPath operations


1. Get all the text under a node:

data = response.xpath('//div[@id="zoomcon"]')
content = ''.join(data.xpath('string(.)').extract())

This grabs every piece of text under the div with the given id; for example, on a detail page such as:


http://www.nhfpc.gov.cn/fzs/s3576/200804/cdbda975a377456a82337dfe1cf176a1.shtml 
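A minimal, self-contained sketch of the difference between string(.) and //text() (the HTML snippet here is made up for illustration, with Scrapy's Selector standing in for a real response):

from scrapy.selector import Selector

html = '<div id="zoomcon"><p>First <b>bold</b> piece.</p><p>Second piece.</p></div>'
sel = Selector(text=html)
data = sel.xpath('//div[@id="zoomcon"]')

# string(.) flattens the whole node into one string;
# .//text() returns the individual text nodes as a list
print ''.join(data.xpath('string(.)').extract())  # First bold piece.Second piece.
print data.xpath('.//text()').extract()           # [u'First ', u'bold', u' piece.', u'Second piece.']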


2. Get the value of an attribute on an HTML node

>>> response.xpath("//div[@id='zoomtime']").extract()
[u'<div class="content_subtitle" id="zoomtime" title="\u53d1\u5e03\u65e5\u671f\uff1a2010-10-26"><span>\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd\u56fd\u5bb6\u536b\u751f\u548c\u8ba1\u5212\u751f\u80b2\u59d4\u5458\u4f1a</span><span class="wzurl_tt" style="margin-left:10px;"></span><span style="margin-left:10px;">2010-10-26</span>\r\n                <span style="margin-left:30px;"></span> </div>']
>>> response.xpath("//div[@id='zoomtime']/@title").extract()
[u'\u53d1\u5e03\u65e5\u671f\uff1a2010-10-26']

Here we want the title attribute of the node with a given id; appending /@title to the selector returns its value directly.
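The same pattern in a self-contained form, including splitting the date out of the attribute value (the snippet is illustrative; \uff1a is the full-width colon ： separating the label from the date):

# -*- coding: utf-8 -*-
from scrapy.selector import Selector

html = u'<div id="zoomtime" title="发布日期：2010-10-26">...</div>'
sel = Selector(text=html)

title = sel.xpath("//div[@id='zoomtime']/@title").extract()[0]
print title.split(u'：')[1]  # 2010-10-26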


The Scrapy project structure:

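A typical layout, assuming the default scrapy startproject skeleton plus the two files listed below (only nhfpc.py and common_lib.py are specific to this project; the rest are the generator's defaults):

nhfpc/
├── scrapy.cfg
└── nhfpc/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    ├── common_lib.py
    └── spiders/
        ├── __init__.py
        └── nhfpc.py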
nhfpc.py

# -*- coding: utf-8 -*-
import scrapy
import sys
import hashlib
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from datetime import *
from common_lib import *

reload(sys)
sys.setdefaultencoding('utf-8')

class NhfpcItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    size = scrapy.Field()
    dateTime = scrapy.Field()
    


class NhfpcSpider(CrawlSpider):
    name = "nhfpc"
    allowed_domains = ["nhfpc.gov.cn"]
    start_urls = (
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_2.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_3.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_4.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_5.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_6.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_7.shtml',
    )

    rules = (
        # detail pages live under a /YYYYMM/ path segment -> scrape them
        Rule(
            LinkExtractor(allow=r'.*\d{6}/.*'),
            callback='parse_item'
        ),
        # follow (but do not scrape) links whose URL contains 201307
        Rule(
            LinkExtractor(allow=r'.*201307.*'),
            follow=True,
        ),
    )

    def parse_item(self, response):
        retList = response.xpath("//div[@id='zoomtitle']/*/text()").extract()

        if len(retList) == 0:
            # some pages use a misspelled id, 'zoomtitl'
            retList = response.xpath("//div[@id='zoomtitl']/*/text()").extract()

        title = retList[0].strip()

        # the body sits in a div whose id is either 'zoomcon' or 'contentzoom'
        data = response.xpath('//div[@id="zoomcon"]')
        if len(data) == 0:
            data = response.xpath('//div[@id="contentzoom"]')
        content = ''.join(data.xpath('string(.)').extract())

        # publication date: prefer the title attribute of #zoomtime
        # ("发布日期：YYYY-MM-DD"), falling back to a <ucmspubtime> element
        pubTime = "1970-01-01 00:00:00"
        time = response.xpath("//div[@id='zoomtime']/@title").extract()

        if len(time) == 0:
            time = response.xpath("//ucmspubtime/text()").extract()
        else:
            # split "发布日期：2010-10-26" on the full-width colon
            time = ''.join(time).split(u'：')[1]

        pubTime = ''.join(time) + " 00:00:00"

        insertTime = datetime.now()
        webSite = "nhfpc.gov.cn"
        md5Url = hashlib.md5(response.url.encode('utf-8')).hexdigest()

        # field order must match the health_policy columns after hid
        values = [title, md5Url, pubTime, insertTime, webSite, content, response.url]
        insertDB(values)
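With the project in place, the crawl is started from the project root with scrapy crawl nhfpc; every detail page matched by the first rule goes through parse_item and is handed to insertDB from common_lib.py.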

 

common_lib.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

'''
This file includes all the common routines that are needed in
the crawler project.
Author: Justnzhang (uestczhangchao@qq.com)
Time: 2014-07-28 15:03:44
'''
import os
import sys
import MySQLdb
from urllib import quote, unquote
import uuid

reload(sys)
sys.setdefaultencoding('utf-8')

def insertDB(values):
    '''Insert one crawled record into health_policy; values must hold the
    seven fields in column order (title, md5url, pub_time, insert_time,
    website, content, url).'''
    print "insertDB"
    print values
    try:
        conn_local = MySQLdb.connect(host='192.168.30.7', user='xxx',
                                     passwd='xxx', db='xxx', port=3306)
        conn_local.set_character_set('utf8')
        cur_local = conn_local.cursor()
        cur_local.execute('SET NAMES utf8;')
        cur_local.execute('SET CHARACTER SET utf8;')
        cur_local.execute('SET character_set_connection=utf8;')

        # NULL lets MySQL fill the AUTO_INCREMENT hid column; MySQLdb
        # escapes each %s placeholder itself, so no manual quoting is needed
        cur_local.execute("insert into health_policy values(NULL,%s,%s,%s,%s,%s,%s,%s)", values)
        conn_local.commit()
        cur_local.close()
        conn_local.close()
    except MySQLdb.Error, e:
        print "Mysql Error %d: %s" % (e.args[0], e.args[1])


if __name__ == '__main__':
    # smoke test: the real caller (parse_item) passes seven fields
    from datetime import datetime
    values = [u'test title', 'd41d8cd98f00b204e9800998ecf8427e',
              '1970-01-01 00:00:00', datetime.now(), 'nhfpc.gov.cn',
              u'test content', 'http://example.com']
    insertDB(values)
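The host and the xxx credentials in MySQLdb.connect are placeholders; point them at a real MySQL instance. The table the insert targets is defined by the following dump: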

 

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for health_policy
-- ----------------------------
DROP TABLE IF EXISTS `health_policy`;
CREATE TABLE `health_policy` (
  `hid` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(1000) DEFAULT NULL COMMENT 'policy title',
  `md5url` varchar(1000) NOT NULL COMMENT 'MD5 hash of the URL',
  `pub_time` datetime DEFAULT NULL COMMENT 'publication time',
  `insert_time` datetime NOT NULL COMMENT 'crawl/insert time',
  `website` varchar(1000) DEFAULT NULL COMMENT 'source website',
  `content` longtext COMMENT 'policy content',
  `url` varchar(1000) DEFAULT NULL,
  PRIMARY KEY (`hid`)
) ENGINE=InnoDB AUTO_INCREMENT=594 DEFAULT CHARSET=utf8;


Original post: http://www.cnblogs.com/justinzhang/p/4482170.html
