
Python: scraping the daily article from the "一个" (ONE) site into a database

Posted: 2018-03-01 11:55:26

Tags: python, article, crawler

# coding:utf-8
# Scrapes the daily article and daily question from the "一个" (ONE) site
# and stores them in a local MySQL database. Written for Python 2.
import requests
from bs4 import BeautifulSoup
import time
import pymysql
import sys

# Python 2 workaround so implicit str/unicode conversions use UTF-8
reload(sys)
sys.setdefaultencoding('utf-8')

# Fetch and parse a daily-article page
def getartinfo(url):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')
    res = {}
    res['curr'] = soup.find('div', class_="comilla-cerrar").string.strip()
    res['title'] = soup.find('h2', class_="articulo-titulo").string.strip()
    res['author'] = soup.find('p', class_="articulo-autor").string.strip()
    res['contents'] = soup.find('div', class_="articulo-contenido")
    res['add_time'] = int(time.time())
    return res

# Fetch and parse a daily-question page
def getqueinfo(url):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')
    res = {}
    res['title'] = soup.find('h4').string.strip()
    res['curr'] = soup.find('div', class_="cuestion-contenido").string.strip()
    res['author'] = soup.find('p', class_="cuestion-editor").string.strip()
    res['contents'] = soup.find_all('div', class_="cuestion-contenido")[1]
    res['add_time'] = int(time.time())
    return res

# Scrape the ONE homepage for today's article and question links
url = "http://wufazhuce.com/"
page = requests.get(url).content
soup = BeautifulSoup(page, 'lxml')

# Daily article
art_list = soup.find_all("p", class_="one-articulo-titulo")
art_url = art_list[0].a.get('href')
artinfo = getartinfo(art_url)

# Daily question
que_list = soup.find_all("p", class_="one-cuestion-titulo")
que_url = que_list[0].a.get('href')
queinfo = getqueinfo(que_url)

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='root', db='one', charset='utf8')
cursor = conn.cursor()
# Parameterized queries, so quotes in the scraped text cannot break the SQL
sql = "INSERT INTO day_art(title,curr,author,contents,add_time) VALUES(%s,%s,%s,%s,%s)"
cursor.execute(sql, (artinfo['title'], artinfo['curr'], artinfo['author'],
                     str(artinfo['contents']), artinfo['add_time']))
cursor.execute(sql, (queinfo['title'], queinfo['curr'], queinfo['author'],
                     str(queinfo['contents']), queinfo['add_time']))
conn.commit()
cursor.close()
conn.close()
print 'ok'
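The script assumes a MySQL database named one with a day_art table already in place; the original post does not show the schema. Below is a minimal setup sketch whose column names match the INSERT statement above. The column types and the id primary key are my assumptions, not taken from the source:

# coding:utf-8
# Hypothetical one-time setup: creates the `one` database and `day_art`
# table the scraper expects. Types and the id column are assumptions.
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='root', charset='utf8')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS one DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS one.day_art (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255) NOT NULL,   -- article or question title
        curr VARCHAR(255),             -- daily quote / question text
        author VARCHAR(100),           -- author line
        contents TEXT,                 -- raw HTML of the body
        add_time INT                   -- Unix timestamp from time.time()
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cursor.close()
conn.close()

Run this once before the scraper. TEXT is used for contents because the scraper stores the raw HTML of the article body, which can easily exceed a VARCHAR limit.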


Original post: http://blog.51cto.com/itafei/2074211
