码迷,mamicode.com
首页 > Web开发 > 详细

jsoup httpclient 爬取网页并下载google图标

时间:2014-12-07 23:04:12      阅读:316      评论:0      收藏:0      [点我收藏+]

标签:des   style   blog   http   io   ar   color   os   使用   

jsoup下载地址 http://www.jsoup.org

httpclient下载地址 http://hc.apache.org/downloads.cgi

其他jar包见附件

Crawler

package jsoup;  
  
import java.io.File;  
import java.io.FileOutputStream;  
import java.io.IOException;  
import java.io.InputStream;  
import java.util.HashMap;  
import java.util.Map;  
  
import org.apache.commons.io.FileUtils;  
import org.apache.commons.io.IOUtils;  
import org.apache.http.HttpEntity;  
import org.apache.http.HttpResponse;  
import org.apache.http.HttpStatus;  
import org.apache.http.client.methods.HttpGet;  
import org.apache.http.impl.client.DefaultHttpClient;  
import org.apache.http.params.HttpProtocolParams;  
import org.apache.http.util.EntityUtils;  
  
import com.google.api.translate.Language;  
import com.google.api.translate.Translate;  
  
/**
 * Base class for the crawlers in this package.
 *
 * <p>Provides a Google-translate wrapper, a map factory, and two HTTP helpers
 * built on Apache HttpClient: one that downloads a resource to a file and one
 * that fetches a page as a string.
 */
public abstract class Crawler {

    /** User-Agent header value sent with every request issued by this class. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9";

    /**
     * Translates English text to Chinese through the Google translate API.
     *
     * @param en
     *            the English text to translate
     * @return the translated text, or an empty string if the translation call
     *         throws (the exception is printed, not propagated)
     */
    public String translateEnToCinese(String en) {
        Translate.setHttpReferrer("http://www.xxx.com");
        try {
            return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Creates a new, empty, mutable map.
     *
     * @return a fresh {@code HashMap} with an initial capacity of 0
     */
    public Map<String, Object> getMap() {
        return new HashMap<String, Object>(0);
    }

    /**
     * Downloads the resource at {@code url} and writes it to the file
     * {@code dir}.
     *
     * <p>The download is best-effort: failures while executing the request or
     * writing the file are printed and the method returns normally. Nothing is
     * written when the server does not answer with HTTP 200 or sends no body,
     * so error pages are never stored in place of the requested file.
     *
     * @param url
     *            HTTP address of the file to download
     * @param dir
     *            path of the destination file (despite the name, this is the
     *            file itself, not a directory); it is opened with
     *            {@code FileUtils.openOutputStream}
     * @throws Exception
     *             if {@code url} is not a syntactically valid URI
     */
    public void downloadFile(String url, String dir) throws Exception {
        // Build the request before the client so that a malformed URL fails
        // without leaving an HTTP client behind that nobody shuts down.
        HttpGet httpGet = new HttpGet(new java.net.URI(url));
        DefaultHttpClient httpClient = new DefaultHttpClient();
        HttpProtocolParams.setUserAgent(httpClient.getParams(), USER_AGENT);

        InputStream input = null;
        FileOutputStream output = null;
        try {
            HttpResponse response = httpClient.execute(httpGet);
            int status = response.getStatusLine().getStatusCode();
            HttpEntity entity = response.getEntity();
            if (status != HttpStatus.SC_OK || entity == null) {
                // Do not save an error page (or nothing at all) under the target name.
                System.err.println("Skipping download of [" + url
                        + "]: HTTP status " + status
                        + (entity == null ? ", no response body" : ""));
                return;
            }
            input = entity.getContent();
            output = FileUtils.openOutputStream(new File(dir));
            IOUtils.copy(input, output);
        } catch (Exception e) {
            System.err.println("Download of [" + url + "] to [" + dir + "] failed");
            e.printStackTrace();
        } finally {
            IOUtils.closeQuietly(output);
            IOUtils.closeQuietly(input);
            // Release the connection and the client's resources; without this
            // every call leaks a connection manager.
            httpGet.abort();
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Performs a GET request and returns the whole response body as text.
     *
     * @param url
     *            address to request
     * @param params
     *            optional; when at least one value is given, the first one is
     *            the charset name handed to {@code EntityUtils.toString}
     *            (defaults to {@code "UTF-8"})
     * @return the response body, or an empty string when the status is not
     *         HTTP 200, the response has no body, or the request fails (the
     *         failure is printed, not propagated)
     * @throws Exception
     *             if {@code url} is not a syntactically valid URI
     */
    public synchronized String doGet(String url, String... params)
            throws Exception {
        String charset = "UTF-8";
        if (null != params && params.length >= 1) {
            charset = params[0];
        }

        // Build the request before the client so that a malformed URL fails
        // without leaving an HTTP client behind that nobody shuts down.
        HttpGet httpGet = new HttpGet(new java.net.URI(url));
        DefaultHttpClient httpClient = new DefaultHttpClient();
        HttpProtocolParams.setUserAgent(httpClient.getParams(), USER_AGENT);

        String content = "";
        try {
            HttpResponse response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // Pass the charset explicitly rather than relying on the
                    // library default (ISO-8859-1 according to the original
                    // author's note).
                    content = EntityUtils.toString(entity, charset);
                }
            }
        } catch (Exception e) {
            System.out.println("访问【" + url + "】出现异常!");
            e.printStackTrace();
        } finally {
            // Release the connection and the client's resources.
            httpGet.abort();
            httpClient.getConnectionManager().shutdown();
        }
        return content;
    }
}

GoogleLogoCrawler

package jsoup;  
  
import java.io.File;  
import java.io.IOException;  
import java.util.ArrayList;  
import java.util.Date;  
import java.util.List;  
import java.util.Map;  
  
import org.apache.commons.io.FileUtils;  
import org.apache.commons.lang.StringUtils;  
import org.json.JSONArray;  
import org.json.JSONObject;  
import org.jsoup.Jsoup;  
import org.jsoup.nodes.Document;  
import org.jsoup.nodes.Element;  
import org.jsoup.select.Elements;  
  
/**
 * Downloads the Google logos listed on logocollect.com.
 *
 * <p>For every configured year the crawler reads the page count from the
 * year's index page, visits each listing page, downloads every logo image
 * found there into a per-page directory under {@link #DIR_PATH}, and finally
 * writes a JSON index of everything it processed.
 */
public class GoogleLogoCrawler extends Crawler {

    /** Listing page URL template; {@code %y} is the year, {@code %p} the page number. */
    private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";

    /** Prefix prepended to each image's {@code src} attribute to form its download URL. */
    private static final String LOGO_URL = "http://www.logocollect.com/google/";

    /** Years to crawl. */
    private static final String[] YEARS = new String[] {
            //"1998", "1999", "2000",
            //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",
            "2009", "2010", "2011", "2012" };

    /** Index page URL template for a year; {@code %y} is the year. */
    private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";

    /** Root output directory (Windows path). */
    private static final String DIR_PATH = "D:\\googlelogos\\";

    /** Text that precedes the total page count in the index page's HTML. */
    private static final String PAGE_COUNT_MARKER = "总页数";

    /**
     * Crawls every page of every configured year, then writes the collected
     * JSON index to the file {@code json} under {@link #DIR_PATH} and prints it.
     */
    public void doStart() {
        JSONArray array = new JSONArray();
        for (String year : YEARS) {
            // The placeholders are literal text, so plain replace() is used
            // instead of the regex-based replaceAll().
            String ind = INDEX.replace("%y", year);
            int pageCount = getPageCount(ind);
            for (int i = 1; i <= pageCount; i++) {
                String url = URL.replace("%y", year).replace("%p", String.valueOf(i));
                String path = year + "_" + i;
                start(url, array, DIR_PATH + path + "\\", path);
            }
        }
        try {
            FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(array);
    }

    /**
     * Reads the total number of listing pages from an index page.
     *
     * <p>The count is taken from the text between the marker
     * {@code "总页数"} plus one separator character and the next comma.
     *
     * @param url
     *            address of the index page
     * @return the parsed page count, or 1 when the page cannot be fetched or
     *         the count cannot be located or parsed
     */
    public int getPageCount(String url) {
        int pageCount = 1;
        try {
            Document doc = Jsoup.connect(url).get();
            String html = doc.html();

            int marker = html.indexOf(PAGE_COUNT_MARKER);
            // Skip the marker itself and the single separator character after it.
            int start = marker + PAGE_COUNT_MARKER.length() + 1;
            int end = marker < 0 ? -1 : html.indexOf(",", start);
            if (end < 0) {
                // Layout differs from what is expected: keep the default
                // instead of failing with an index error.
                System.err.println("Page count not found in [" + url + "], assuming " + pageCount);
                return pageCount;
            }
            pageCount = Integer.parseInt(html.substring(start, end).trim());
            System.out.println(pageCount);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (NumberFormatException e) {
            System.err.println("Page count in [" + url + "] is not a number, assuming " + pageCount);
            e.printStackTrace();
        }
        return pageCount;
    }

    /**
     * Processes one listing page: downloads every image matched by
     * {@code ".img img"} and appends a JSON object for the page to
     * {@code array}.
     *
     * <p>The appended object has the single key {@code path} whose value is an
     * array of entries with the keys {@code url}, {@code dir} (the generated
     * file name) and {@code title}. Any exception is printed and ends the
     * processing of this page without adding anything to {@code array}.
     *
     * @param url
     *            address of the listing page
     * @param array
     *            JSON array that receives this page's result
     * @param dir
     *            directory prefix (with trailing separator) the images are
     *            written to
     * @param path
     *            key under which this page's entries are stored
     */
    public void start(String url, JSONArray array, String dir, String path) {
        try {
            String content = super.doGet(url);
            Document doc = Jsoup.parse(content);
            Elements images = doc.select(".img img");

            JSONArray pageEntries = new JSONArray();
            long lastStamp = 0L;
            for (Element img : images) {
                String logoUrl = LOGO_URL + img.attr("src");
                String title = img.attr("title");
                String extension = StringUtils.substringAfterLast(logoUrl, ".");

                // File names are millisecond timestamps. Force them to be
                // strictly increasing so two images handled within the same
                // millisecond cannot overwrite each other.
                long stamp = Math.max(System.currentTimeMillis(), lastStamp + 1);
                lastStamp = stamp;
                String name = stamp + "." + extension;

                JSONObject entry = new JSONObject();
                entry.put("url", logoUrl);
                entry.put("dir", name);
                entry.put("title", title);

                // Title translation via translateEnToCinese() is intentionally
                // not performed here.

                // Download the image.
                super.downloadFile(logoUrl, dir + name);
                pageEntries.put(entry);
            }
            array.put(new JSONObject().put(path, pageEntries));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Command-line entry point; runs a full crawl.
     *
     * @param args
     *            ignored
     * @throws Exception
     *             declared for compatibility
     */
    public static void main(String[] args) throws Exception {
        new GoogleLogoCrawler().doStart();
    }

}

本文转自:http://you-java.iteye.com/blog/1460271

jsoup httpclient 爬取网页并下载google图标

标签:des   style   blog   http   io   ar   color   os   使用   

原文地址:http://www.cnblogs.com/dreammyle/p/4150003.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!