# coding:utf-8
import requests
from bs4 import BeautifulSoup
import json
import time
import datetime
import pymysql
import sys
# Python 2 only: make UTF-8 the interpreter-wide default encoding so implicit
# str/unicode conversions of the scraped (non-ASCII) text do not fail.
# Python 3 has neither a `reload` builtin nor `sys.setdefaultencoding`, and
# its default encoding is already UTF-8, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # restores sys.setdefaultencoding, which is removed at startup
    sys.setdefaultencoding('utf-8')
def getartinfo(url, timeout=10):
    """Download a daily-article page and extract its fields.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys:
          * ``curr``     - stripped text of ``div.comilla-cerrar``
          * ``title``    - stripped text of ``h2.articulo-titulo``
          * ``auchor``   - stripped text of ``p.articulo-autor``
          * ``contents`` - the ``div.articulo-contenido`` tag itself
                           (None when the page has no such element)
          * ``add_time`` - integer Unix timestamp taken at extraction time

    Raises:
        requests.RequestException: on connection failure, timeout or an
            HTTP error status.
        AttributeError: if one of the three required text elements is
            missing from the page.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # get_text() is used instead of .string because .string is None as soon
    # as the tag has more than one child (e.g. an embedded <br/>), which would
    # make .strip() blow up.
    return {
        'curr': soup.find('div', class_="comilla-cerrar").get_text().strip(),
        'title': soup.find('h2', class_="articulo-titulo").get_text().strip(),
        # The key is spelled 'auchor' (sic); callers read it under that name.
        'auchor': soup.find('p', class_="articulo-autor").get_text().strip(),
        'contents': soup.find('div', class_="articulo-contenido"),
        'add_time': int(time.time()),
    }
def getqueinfo(url, timeout=10):
    """Download a daily Q&A page and extract its fields.

    Args:
        url: Address of the question page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys:
          * ``title``    - stripped text of the first ``h4`` element
          * ``curr``     - stripped text of the first ``div.cuestion-contenido``
          * ``auchor``   - stripped text of ``p.cuestion-editor``
          * ``contents`` - the second ``div.cuestion-contenido`` tag itself
          * ``add_time`` - integer Unix timestamp taken at extraction time

    Raises:
        requests.RequestException: on connection failure, timeout or an
            HTTP error status.
        AttributeError: if a required text element is missing from the page.
        IndexError: if the page has fewer than two ``div.cuestion-contenido``
            elements.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # get_text() is used instead of .string because .string is None as soon
    # as the tag has more than one child (e.g. an embedded <br/>), which would
    # make .strip() blow up.
    return {
        'title': soup.find('h4').get_text().strip(),
        'curr': soup.find('div', class_="cuestion-contenido").get_text().strip(),
        # The key is spelled 'auchor' (sic); callers read it under that name.
        'auchor': soup.find('p', class_="cuestion-editor").get_text().strip(),
        # The first matching div is read as 'curr' above; the second is
        # stored whole as the entry's contents.
        'contents': soup.find_all('div', class_="cuestion-contenido")[1],
        'add_time': int(time.time()),
    }
# Scrape today's article and today's Q&A from the site's home page and store
# both in MySQL.
url = "http://wufazhuce.com/"
response = requests.get(url, timeout=10)
response.raise_for_status()
page = response.content
soup = BeautifulSoup(page, 'lxml')

# Daily article: follow the first article link found on the home page.
art_list = soup.find_all("p", class_="one-articulo-titulo")
art_url = art_list[0].a.get('href')
artinfo = getartinfo(art_url)

# Daily Q&A: follow the first question link found on the home page.
que_list = soup.find_all("p", class_="one-cuestion-titulo")
que_url = que_list[0].a.get('href')
queinfo = getqueinfo(que_url)

# Placeholders let the driver escape the values. The scraped text and HTML
# routinely contain quotes, which would break (or inject into) a statement
# assembled with str.format().
insert_sql = (
    "INSERT INTO day_art(title,curr,author,contents,add_time)"
    "VALUES(%s,%s,%s,%s,%s);"
)

conn = pymysql.connect(host='localhost', port=3306, user='root', password='root', db='one', charset='utf8')
try:
    cursor = conn.cursor()
    try:
        # NOTE(review): the Q&A row is written to day_art as well, exactly as
        # the original script did - confirm whether a separate table was meant.
        for info in (artinfo, queinfo):
            cursor.execute(insert_sql, (
                info['title'],
                info['curr'],
                info['auchor'],
                # 'contents' is a parsed tag, not a string; serialise it to
                # its HTML markup before handing it to the driver.
                str(info['contents']),
                info['add_time'],
            ))
        conn.commit()
    finally:
        cursor.close()
finally:
    # Release the connection even when scraping-derived data makes an
    # INSERT fail.
    conn.close()
print('ok')
# Original article (原文地址): http://blog.51cto.com/itafei/2074211