码迷,mamicode.com
首页 > 数据库 > 详细

猫眼电影爬取(三):requests+pyquery,并将数据存储到mysql数据库

时间:2018-06-27 22:20:35      阅读:279      评论:0      收藏:0      [点我收藏+]

标签:from   size   str   参数   exe   apple   sql数据库   pass   text   

还是以猫眼电影为例,这次用pyquery库进行爬取

1.简单demo,看看如何使用pyquery提取信息,并将提取到的数据进行组合

# coding: utf-8
# author: hmk

import requests
from pyquery import PyQuery as pq


# Demo: download the first page of the Maoyan Top-100 board and extract each
# movie's rank, title, release time and score with pyquery CSS selectors.
url = "http://maoyan.com/board/4"
# Browser-like request headers sent with the page request.
header = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
              "Accept-Encoding": "gzip, deflate, sdch",
              "Accept-Language": "zh-CN,zh;q=0.8",
              "Cache-Control": "max-age=0",
              "Connection": "keep-alive",
              "Host": "maoyan.com",
              "Referer": "http://maoyan.com/board",
              "Upgrade-Insecure-Requests": "1",
              "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36"}
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, headers=header, timeout=10)
# Decode the body with the encoding detected from the content itself.
r.encoding = r.apparent_encoding
html = r.text
print(type(html))

doc = pq(html)
# Whole-page selections (every movie at once) would look like this:
# print(doc("dd").find(".board-index"))
# print(doc(".name").text())
# print(doc(".releasetime").text())
# print(doc("dd").find(".integer").text() + doc(".fraction").text())

# Walk the <dd> elements one by one so the four fields of a movie stay together.
movies = []
for item in doc("dd").items():
    index = item.find(".board-index").text()
    print(index)
    movie = item.find(".name").text()
    print(movie)
    release_time = item.find(".releasetime").text()
    print(release_time)
    # The score is rendered as two elements: integer part and fraction part.
    score = item.find(".integer").text() + item.find(".fraction").text()
    print(score)
    movies.append([index, movie, release_time, score])
print(movies)

 

2.正式代码

# coding: utf-8
# author: hmk

import requests
from pyquery import PyQuery as pq
import pymysql.cursors


def get_html(url, header, timeout=10):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.
        header: Dict of HTTP request headers to send.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely.

    Returns:
        The response body as text, or None when the request failed
        (connection error, timeout, invalid URL, ...).
    """
    try:
        r = requests.get(url=url, headers=header, timeout=timeout)
    except requests.RequestException:
        # Only network/HTTP-layer failures are treated as "no page";
        # programming errors and KeyboardInterrupt still propagate.
        return None
    # Decode the body with the encoding detected from the content itself.
    r.encoding = r.apparent_encoding
    return r.text


def get_data(html, list_data):
    """Extract movie records from a board page and append them to list_data.

    Each ``<dd>`` element of the page is treated as one movie. For every movie
    the rank, title, release time and score are printed and then appended to
    ``list_data`` as ``[index, movie, release_time, score]``.

    Args:
        html: Page source as text. None or an empty string (for example after
            a failed download) is ignored.
        list_data: List that receives one four-item list per movie; it is
            modified in place.

    Returns:
        None.
    """
    if not html:
        # Nothing to parse; leave list_data untouched.
        return None

    doc = pq(html)
    # .items() yields each matched element already wrapped as a PyQuery object.
    for item in doc("dd").items():
        index = item.find(".board-index").text()
        print(index)
        movie = item.find(".name").text()
        print(movie)
        release_time = item.find(".releasetime").text()
        print(release_time)
        # The score is rendered as two elements: integer part and fraction part.
        score = item.find(".integer").text() + item.find(".fraction").text()
        print(score)
        list_data.append([index, movie, release_time, score])
    return None


def write_sql(data):
    """Insert scraped movie rows into the ``maoyan_movie`` MySQL table.

    ``data`` is the full list of extracted movies; each element is itself a
    small list holding one movie's values in the order
    (ranking, movie, release_time, score). The rows are inserted one at a
    time and committed individually, so one bad row does not prevent the
    others from being stored. A success or failure message is printed per row.

    Args:
        data: Sequence of four-item rows to insert. When it is empty no
            database connection is opened.

    Returns:
        None.
    """
    if not data:
        # Nothing to insert; avoid opening a connection for no work.
        return None

    conn = pymysql.connect(host="localhost",
                           user="root",
                           password="123456",
                           db="test",
                           charset="utf8")
    # Parameterized insert statement; the driver fills in the placeholders.
    sql = "insert into maoyan_movie(ranking,movie,release_time,score) values(%s, %s, %s, %s)"
    try:
        with conn.cursor() as cur:
            for movie in data:
                try:
                    cur.execute(sql, movie)
                    conn.commit()  # make the inserted row permanent
                except Exception as exc:
                    # Best effort per row: undo the failed statement, report
                    # it and carry on with the next movie.
                    conn.rollback()
                    print("导入失败", exc)
                else:
                    print("导入成功")
    finally:
        # Always release the connection, even if something above raised.
        conn.close()
    return None


def main():
    """Crawl the Maoyan Top-100 board page by page and store it in MySQL.

    The board shows ten movies per page and is paged through the ``offset``
    query parameter (0, 10, 20, ...). Every page is downloaded, parsed and
    written to the database before the next one is requested.
    """
    start_url = "http://maoyan.com/board/4"
    depth = 10  # number of pages to crawl
    # Browser-like request headers sent with every page request.
    header = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
              "Accept-Encoding": "gzip, deflate, sdch",
              "Accept-Language": "zh-CN,zh;q=0.8",
              "Cache-Control": "max-age=0",
              "Connection": "keep-alive",
              "Host": "maoyan.com",
              "Referer": "http://maoyan.com/board",
              "Upgrade-Insecure-Requests": "1",
              "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36"}

    for i in range(depth):
        url = start_url + "?offset=" + str(10 * i)
        html = get_html(url, header)
        if html is None:
            # The download failed; skip this page instead of parsing None.
            print("页面获取失败:", url)
            continue
        list_data = []
        get_data(html, list_data)
        write_sql(list_data)


if __name__ == "__main__":
    main()

其实就这个例子来说,使用pyquery来提取信息是最简单省事的了,直接使用CSS选择器就可以把想要的数据拿到。

猫眼电影爬取(三):requests+pyquery,并将数据存储到mysql数据库

标签:from   size   str   参数   exe   apple   sql数据库   pass   text   

原文地址:https://www.cnblogs.com/hanmk/p/9236033.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!