码迷,mamicode.com
首页 > 其他好文 > 详细

ruby爬虫模板

时间:2019-08-18 13:29:57      阅读:88      评论:0      收藏:0      [点我收藏+]

标签:put   utils   def   require   exists   code   with   EDA   tps   

require 'base64'
require 'fileutils'
require 'json'
require 'logger'
require 'open-uri'
require 'yaml'

require 'nokogiri'
require 'open_uri_redirections'
require 'restclient'

# Upper bound on the number of attempts made for a single URL.
MAX_RETRY_TIMES = 5
# Directory that downloaded images are written into.
ROOT_DIR = '/home/zn/work/small-tools-master/zlk/tu/'.freeze
# URL prefix of the crawled pages; the numeric page id is appended to it.
BASE_URL = 'https://newceshiao.com/mnkc/tiku/?id='.freeze

# Cookies sent with the RestClient requests.
# NOTE(review): this embeds what looks like a live session token; consider
# reading it from the environment instead of committing it to source control.
COOKIE = {
  VerificationCodeNum: 1,
  QZ_KSUser: 'UserID=15357507&UserName=ppkao1520606811&UserToken=cw05IVsvRbyxuPoQeQIU4%252bZNshdiFE%252fN6LGCVScB%252bnQLBUYAu7SA7A%253d%253d'
}.freeze

# Raw Cookie header, User-Agent and Content-Type values for hand-built
# requests. No active code visible in this file reads them; they are kept as
# template values.
@cookie = 'VerificationCodeNum=1; PPKAO=PPKAOSTID%3D987%26PPKAOCEID%3D%26PPKAOSJID%3D%26UserName%3D%26EDays%3D'

@agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/63.0.3239.84 Chrome/63.0.3239.84 Safari/537.36"
@content_type = "application/x-www-form-urlencoded"

# One log file per failure category, created in the current working directory.
@download_error = Logger.new('download_error.log')
@no_doc = Logger.new('nodoc_error.log')
@parse_error = Logger.new('parse_error.log')

# mkdir_p is a no-op when the directory already exists, so no existence check
# is needed (File.exists? was removed in Ruby 3.2).
FileUtils.mkdir_p(ROOT_DIR)


# Downloads +image+ and stores it under ROOT_DIR with a generated file name.
#
# The name is the current Unix timestamp followed by a zero-padded random
# four-digit number; the extension is whatever follows the last "." in +image+.
#
# NOTE(review): URI.open falls back to Kernel#open for strings that are not
# URLs, so +image+ must not come from untrusted input.
#
# @param image [String] URL (or local path) of the image to fetch
# @return [String, nil] the generated file name, or nil when fetching or
#   writing failed (the error message is printed to stdout)
def download_image(image)
  suffix = image.sub(/.+\./, '')
  img = format('%s%04d.%s', Time.now.to_i, rand(10_000), suffix)

  # Fetch before touching the disk so a failed download leaves no empty file.
  data = URI.open(image, 'rb', &:read)
  # Images are binary data: write without newline or encoding translation.
  File.binwrite(File.join(ROOT_DIR, img), data)
  img
rescue StandardError => e
  puts e.message
  nil
end

# Reads the image at +image_src+ and returns its content Base64-encoded.
#
# The source is opened in binary mode and closed as soon as it has been read.
#
# @param image_src [String] path or URL of the image
# @return [String] the output of Base64.encode64 for the raw bytes
def img_base64(image_src)
  raw = URI.open(image_src, 'rb', &:read)
  Base64.encode64(raw)
end

# Fetches +search_link+ and parses the response body as HTML.
#
# The request carries the COOKIE hash. It is issued without a block so that
# RestClient itself follows GET redirects and raises on error statuses; the
# previous block form called +follow_redirection+ on every response, which
# fails for responses that carry no Location header.
#
# Each failure is printed, logged to @download_error and retried until
# MAX_RETRY_TIMES attempts have been made.
#
# @param search_link [String] URL of the page to fetch
# @return [Nokogiri::HTML::Document, nil] the parsed page, or nil when every
#   attempt failed
def get_doc(search_link)
  attempts = 0

  begin
    response = RestClient.get(search_link, cookies: COOKIE)
    Nokogiri::HTML(response.body)
  rescue StandardError => e
    attempts += 1
    puts e.message
    @download_error.error "download error: #{search_link}"
    # The counter lives outside the begin block, so retry cannot reset it.
    retry if attempts < MAX_RETRY_TIMES
    nil
  end
end

# Crawls every page id in +pages+ and appends the extracted records to
# result.yaml in batches of ten.
#
# Pages that cannot be downloaded are logged to @no_doc, pages that lack the
# expected elements are logged to @parse_error; both are skipped. After each
# stored page the crawl pauses for a random 4 to 10 seconds.
#
# Whatever is still buffered when the loop ends is written out, so a failure
# on the final page no longer drops the last partial batch.
#
# @param pages [Enumerable<Integer>] page ids appended to BASE_URL
# @return [void]
def process(pages = 18283..18583)
  batch = []

  pages.each do |id|
    link = "#{BASE_URL}#{id}"
    puts link

    doc = get_doc(link)
    if doc.nil?
      @no_doc.error link
      next
    end

    record = extract_record(doc)
    if record.nil?
      @parse_error.error link
      next
    end

    batch << record
    if batch.size >= 10
      append_results(batch)
      batch = []
    end

    sleep rand(4..10)
  end

  append_results(batch) unless batch.empty?
  nil
end

# Builds the record for one parsed page, or returns nil when the page lacks
# the heading link or an image.
#
# NOTE(review): the published source read `doc.css(img)[src]` with its quotes
# lost; it is interpreted here as the src attribute of the first <img>.
def extract_record(doc)
  heading = doc.at_css('.ttop h3 a')
  picture = doc.at_css('img')
  return nil if heading.nil? || picture.nil?

  { 'ctg_one' => heading.text, 'ctg_two' => picture['src'] }
end

# Appends +records+ to +path+ as one YAML document.
def append_results(records, path = 'result.yaml')
  File.open(path, 'a+') { |f| YAML.dump(records, f) }
end

# Script entry point: the crawl starts as soon as this file is loaded.
process

 

ruby爬虫模板

标签:put   utils   def   require   exists   code   with   EDA   tps   

原文地址:https://www.cnblogs.com/znsongshu/p/11371947.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!