码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫大作业

时间:2018-04-30 18:00:26      阅读:207      评论:0      收藏:0      [点我收藏+]

标签:inf   hang   nec   ide   links   www.   png   作业   app   

代码:

#encoding=utf-8
import re
import requests
import urllib2
import datetime
import MySQLdb
from bs4 import BeautifulSoup
import sys
# Python 2 idiom: reload the sys module and then set the interpreter-wide
# default string encoding to UTF-8, so implicit str/unicode conversions of the
# scraped (non-ASCII) text use UTF-8.
# NOTE(review): reload() as a builtin and sys.setdefaultencoding() exist only
# on Python 2 — confirm the target interpreter before porting.
reload(sys)
sys.setdefaultencoding("utf-8")
class Splider(object):
    """Scrape article listings page by page and store the records in MySQL.

    Typical flow: ``changepage`` expands a listing URL into one URL per page,
    ``getsource`` downloads a page, ``getcontent`` extracts the article
    records from it, and ``saveinfo`` inserts the collected records.
    """

    # Matches the "page/<n>" segment of a paginated listing URL.
    _PAGE_RE = re.compile(r"page/(\d+)")
    # Captures the value of every src="..." attribute in serialized markup;
    # re.S lets a value span line breaks.
    _IMG_SRC_RE = re.compile(r'src="(.*?)"', re.S)
    # Record keys in the order of the columns named in the INSERT statement.
    _COLUMNS = (
        "title",
        "summary",
        "content",
        "content_time",
        "imgs",
        "source",
        "source_url",
        "create_time",
    )

    def __init__(self):
        print(u"开始爬取内容...")

    def getsource(self, url):
        """Download ``url`` with a browser-like User-Agent and return the body.

        :param url: address of the page to fetch.
        :returns: the raw response body as returned by ``read()``.
        """
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/50.0.2652.0 Safari/537.36"
            )
        }
        req = urllib2.Request(url=url, headers=headers)
        socket = urllib2.urlopen(req)
        try:
            return socket.read()
        finally:
            # Release the connection even when read() fails.
            socket.close()

    def changepage(self, url, total_page):
        """Build the list of listing URLs from the page in ``url`` up to ``total_page``.

        :param url: listing URL containing a ``page/<n>`` segment.
        :param total_page: last page number to generate (inclusive).
        :returns: one URL per page, starting at the page number found in
            ``url``. The list is empty when that number is greater than
            ``total_page``. When ``url`` has no ``page/<n>`` segment there is
            nothing to paginate, so ``[url]`` is returned unchanged.
        """
        match = self._PAGE_RE.search(url)
        if match is None:
            return [url]
        now_page = int(match.group(1))
        # No flags are passed to sub(): its fourth positional argument is
        # ``count``, not ``flags``.
        return [
            self._PAGE_RE.sub("page/%d" % i, url)
            for i in range(now_page, total_page + 1)
        ]

    def getchildrencon(self, child_url):
        """Fetch a child article page and extract its text and image URLs.

        :param child_url: address of the article page.
        :returns: dict with ``"con"`` (text of the ``c-article_content`` div)
            and ``"img"`` (the ``src`` values found in that div joined by
            ``";"``).
        """
        conobj = {}
        page = self.getsource(child_url)
        soup = BeautifulSoup(page, "html.parser", from_encoding="utf-8")
        content = soup.find("div", {"class": "c-article_content"})
        img = self._IMG_SRC_RE.findall(str(content))
        conobj["con"] = content.get_text()
        conobj["img"] = ";".join(img)
        return conobj

    def getcontent(self, html_doc):
        """Extract one record per ``promo-feed-headline`` entry of a listing page.

        Each entry's detail page is downloaded with ``getsource`` to obtain
        the full content, a summary and the image URLs.

        :param html_doc: markup of the listing page.
        :returns: dict mapping a 0-based entry index to a record dict with
            the keys ``title``, ``content_time``, ``source``, ``source_url``,
            ``content``, ``summary``, ``imgs`` and ``create_time``.
        """
        soup = BeautifulSoup(html_doc, "html.parser", from_encoding="utf-8")
        tag = soup.find_all("div", {"class": "promo-feed-headline"})
        info = {}
        for i, link in enumerate(tag):
            record = {}
            record["title"] = link.find("h3").get_text()
            post_date = link.find("div", {"class": "post-date"})
            # Keep only the first 10 characters of the data-date attribute.
            record["content_time"] = post_date["data-date"][0:10]
            record["source"] = "whowhatwear"
            source_link = link.find(
                "a", href=re.compile(r"section=fashion-trends")
            )
            source_url = "http://www.whowhatwear.com" + source_link["href"]
            record["source_url"] = source_url

            in_content = self.getsource(source_url)
            in_soup = BeautifulSoup(
                in_content, "html.parser", from_encoding="utf-8"
            )
            soup_content = in_soup.find(
                "section", {"class": "widgets-list-content"}
            )
            record["content"] = soup_content.get_text().strip("\n")

            text_con = in_soup.find("section", {"class": "text"})
            # find() returns None when the section is missing; test the
            # element itself, not its .text attribute.
            if text_con is not None:
                summary = text_con.get_text().strip("\n")
            else:
                summary = "NULL"
            record["summary"] = summary[0:200] + "..."

            img_list = self._IMG_SRC_RE.findall(str(soup_content))
            record["imgs"] = ";".join(img_list)
            record["create_time"] = datetime.datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S"
            )
            info[i] = record
        return info

    def saveinfo(self, content_info):
        """Insert every scraped record into the ``t_fashion_spider2`` table.

        :param content_info: iterable of dicts as returned by ``getcontent``
            (index -> record).

        All rows are committed together after the last insert; the cursor and
        connection are closed even if an insert fails.
        """
        # The values are scraped (untrusted) text, so they are passed as query
        # parameters and left to the driver to quote, never interpolated.
        sql = (
            "insert into t_fashion_spider2"
            "(`title`,`summary`,`content`,`content_time`,"
            "`imgs`,`source`,`source_url`,`create_time`) "
            "values (%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        # NOTE(review): connection settings are hard-coded here — consider
        # moving them to configuration.
        conn = MySQLdb.Connect(
            host="127.0.0.1",
            user="root",
            passwd="123456",
            port=3306,
            db="test",
            charset="utf8",
        )
        try:
            cursor = conn.cursor()
            try:
                for page_info in content_info:
                    for record in page_info.values():
                        params = tuple(record[col] for col in self._COLUMNS)
                        cursor.execute(sql, params)
                conn.commit()
            finally:
                cursor.close()
        finally:
            conn.close()
# Script entry point: expand the start URL into page links, scrape each page,
# then store everything that was collected in a single saveinfo() call.
if __name__ == "__main__":
    classinfo = []
    p_num = 5  # last page number to crawl
    # NOTE(review): this URL has no "page/<n>" segment, which changepage()
    # searches for — confirm the intended paginated listing URL.
    url = "http://china.nba.com/rockets/"
    jikesplider = Splider()
    all_links = jikesplider.changepage(url, p_num)
    for link in all_links:
        print(u"正在处理页面:" + link)
        html = jikesplider.getsource(link)
        info = jikesplider.getcontent(html)
        classinfo.append(info)
    # Save once after the loop; saving per page would re-insert the records
    # already accumulated in classinfo.
    jikesplider.saveinfo(classinfo)

 

截图:

技术分享图片

 

爬虫大作业

标签:inf   hang   nec   ide   links   www.   png   作业   app   

原文地址:https://www.cnblogs.com/verson/p/8973845.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!