码迷,mamicode.com
首页 > 其他好文 > 详细

dogedoge浏览器爬取标题

时间:2020-04-24 21:40:14      阅读:451      评论:0      收藏:0      [点我收藏+]

标签:spl   host   format   break   import   pass   encoding   exception   lxml   

# coding:utf-8
import hashlib

import datetime
import lxml
import pymysql
import requests

from lxml import etree
import sys

# Python 2 only: make UTF-8 the codec used for implicit str <-> unicode
# conversions. reload() is needed because the interpreter removes
# sys.setdefaultencoding from the module after start-up.
# Python 3 has neither name and already defaults to UTF-8, so skip it there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')

def search_data(kw, n):
    """Fetch one dogedoge results page, store its hits, and report the next-page marker.

    Args:
        kw: Search keyword; sent as the ``q`` query parameter.
        n: Results page number. Pages above 1 also send the ``p`` parameter.

    Returns:
        The non-empty list of ``//div[@id="rld-2"]`` elements when the page
        contains one (this code treats that element as the "next page"
        marker), otherwise ``''``. Callers only rely on truthiness.

    Raises:
        requests.exceptions.RequestException: if the HTTP request fails or
            times out.
    """
    # Let requests build and percent-encode the query string so keywords
    # containing spaces, '&' or '#' cannot corrupt the URL, and issue exactly
    # one request per call.
    params = {'q': kw}
    if n > 1:
        params['p'] = n
    res = requests.get('https://www.dogedoge.com/results', params=params, timeout=10)

    con = etree.HTML(res.text)
    results = con.xpath(
        '//div[@class="result results_links_deep highlight_d result--url-above-snippet"]')

    ll = []
    for node in results:
        title = ''.join(node.xpath('./div/h2/a//text()'))
        url = ''.join(node.xpath('./div/div/div/a/span//text()'))

        # Host part of the displayed URL: drop an optional "scheme://" prefix,
        # then keep everything before the first '/'. Never raises on short input.
        domain = url.split('://', 1)[-1].split('/', 1)[0]

        # hashlib needs bytes; text is encoded as UTF-8 before hashing.
        url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
        md5 = hashlib.md5(url_bytes).hexdigest()

        item = {}
        item['keywd'] = kw
        item['domain'] = domain
        item['title'] = title
        item['md5'] = md5
        item['url'] = url
        item['searcher'] = 'dogedoge'
        ll.append(item)
    save(ll)

    # xpath() returns an empty list (it does not raise) when nothing matches,
    # so the "no next page" case has to be detected by emptiness.
    next_marker = con.xpath('//div[@id="rld-2"]')
    if not next_marker:
        print('没有下一页了')
        return ''
    return next_marker


def main(kw):
    """Crawl successive result pages for ``kw`` until no next page is reported.

    Starts at page 1 and keeps calling ``search_data`` with an increasing
    page number for as long as it returns a truthy value.
    """
    page = 1
    while search_data(kw, page):
        page += 1


def save(ll):
    """Insert scraped result items into the ``weixintb`` MySQL table.

    Each item is inserted and committed on its own; a failing row is logged,
    rolled back and skipped so the remaining rows are still attempted.

    Args:
        ll: List of dicts with the keys ``md5``, ``keywd``, ``title``,
            ``url``, ``domain`` and ``searcher``.

    Returns:
        None.
    """
    import logging

    # Nothing to store: do not open a database connection at all.
    if not ll:
        return

    # NOTE(review): MYSQL_HOST / MYSQL_DBNAME / MYSQL_USER / MYSQL_PASSWD are
    # not defined in the visible source; presumably module-level settings
    # (or placeholders to fill in) - confirm before running.
    db = pymysql.connect(
        host=MYSQL_HOST,
        db=MYSQL_DBNAME,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWD,
        charset='utf8',
        use_unicode=True)
    try:
        cursor = db.cursor()
        try:
            for item in ll:
                try:
                    # Insert the row.
                    cursor.execute(
                        "insert into weixintb(md5,keyword,title,url,`date`,`domain`, browser) value(%s, %s, %s, %s, %s, %s,%s)",
                        (item['md5'],
                         item['keywd'],
                         item['title'],
                         item['url'],
                         datetime.datetime.now(),
                         item['domain'],
                         item['searcher']
                         ))
                    # Commit this row's statement.
                    db.commit()
                except Exception:
                    # Best effort per row: record why it failed, undo it, go on.
                    logging.exception('failed to insert item %r', item)
                    db.rollback()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if cursor creation or a
        # rollback raised.
        db.close()

if __name__ == '__main__':
    # Script entry point. The literal is a placeholder meaning
    # "keyword to crawl"; replace it with the real search term.
    main('爬取关键词')

 

dogedoge浏览器爬取标题

标签:spl   host   format   break   import   pass   encoding   exception   lxml   

原文地址:https://www.cnblogs.com/qxh-beijing2016/p/12770181.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!