码迷,mamicode.com
首页 > 编程语言 > 详细

java robots协议检测工具

时间:2016-04-12 19:18:23      阅读:256      评论:0      收藏:0      [点我收藏+]

标签:

/**
 * @(#)SearchCrawler.java, 2016年4月12日. Copyright 2016 Youdao, Inc. All rights
 *                         reserved. YOUDAO PROPRIETARY/CONFIDENTIAL. Use is
 *                         subject to license terms.
 */
package testZK;

import java.util.*;
import java.net.*;
import java.io.*;

/**
 * Command-line tool that checks whether URLs may be crawled according to the
 * robots.txt of their host and records one verdict per URL in
 * {@code result.txt} (created in the working directory and truncated when this
 * class is loaded).
 *
 * <p>Each successfully checked URL produces a line of the form
 * {@code url:true} (crawling allowed) or {@code url:false} (disallowed).
 * URLs that are blank, malformed or not {@code http://} are skipped and
 * reported on standard error.
 *
 * <p>Known limitations, kept from the original design: every
 * {@code Disallow} line is applied regardless of the {@code User-agent}
 * group it belongs to, {@code Allow} lines and wildcards are not interpreted,
 * and a leading {@code www.} is stripped from the URL before the host's
 * robots.txt is looked up. The shared result writer is not synchronized.
 *
 * @author zhoukang
 */
public class SearchCrawler extends Thread {
    /** The only robots.txt directive this tool interprets (matched ignoring case). */
    private static final String DISALLOW_DIRECTIVE = "Disallow:";

    private static final File resultFile = new File("result.txt");

    /** Shared result writer; {@code null} when the result file could not be opened. */
    private static final BufferedWriter writer = openResultWriter();

    /** Disallowed path prefixes per authority (lower-cased host, plus port if explicit). */
    private final Map<String, List<String>> disallowListCache = new HashMap<String, List<String>>();

    /** URLs to check, in input order; may contain blank entries, which are skipped. */
    private final List<String> urlList;

    /**
     * Opens (and truncates) the result file.
     *
     * @return the writer, or {@code null} if the file cannot be opened
     */
    private static BufferedWriter openResultWriter() {
        try {
            return new BufferedWriter(new FileWriter(resultFile));
        } catch (Exception e) {
            // Broad catch on purpose: an exception escaping static initialization
            // would make the whole class unusable.
            System.err.println("Cannot open " + resultFile + " for writing: " + e);
            return null;
        }
    }

    /**
     * Creates a crawler for a single URL or for a list of URLs read from a file.
     *
     * @param str  the URL to check, or the path of a text file with one URL per
     *             line when {@code file} is {@code true}
     * @param file whether {@code str} names a file of URLs
     * @throws IOException if closing the URL list file fails; failures to open
     *                     or read it are reported on standard error and leave
     *                     the crawler with the URLs read so far
     */
    public SearchCrawler(String str, boolean file) throws IOException {
        urlList = new ArrayList<String>();
        if (!file) {
            urlList.add(str);
            return;
        }
        if (str == null) {
            System.err.println("No URL list file given");
            return;
        }

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(str));
            String line;
            // Test for end-of-file before adding, so no trailing null is stored.
            while ((line = reader.readLine()) != null) {
                urlList.add(line);
            }
        } catch (IOException e) {
            System.err.println("Cannot read URL list " + str + ": " + e);
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * Returns the shared writer for the result file.
     *
     * @return the writer, or {@code null} if the result file could not be opened
     */
    public BufferedWriter getBufferedWriter() {
        return writer;
    }

    /** Checks every URL this crawler was created with. */
    @Override
    public void run() {
        checkUrl(urlList);
    }

    /**
     * Checks each URL against its host's robots.txt and appends the verdict to
     * the result file, flushing after every line.
     *
     * @param urls the URLs to check; {@code null} and blank entries are skipped
     */
    private void checkUrl(List<String> urls) {
        if (writer == null) {
            System.err.println("Result file is not open; no URLs were checked");
            return;
        }
        for (String rawUrl : urls) {
            if (rawUrl == null) {
                continue;
            }
            String url = rawUrl.trim();
            if (url.length() == 0) {
                continue;
            }
            url = removeWwwFromUrl(url);
            System.out.println(url);

            URL verifiedUrl = verifyUrl(url);
            if (verifiedUrl == null) {
                System.err.println("Skipping unsupported or malformed URL: " + url);
                continue;
            }
            try {
                writer.write(url + ":" + isRobotAllowed(verifiedUrl));
                writer.newLine();
                writer.flush();
            } catch (Exception e) {
                // Per-URL boundary: one bad URL must not abort the remaining ones.
                System.err.println("Failed to check " + url + ": " + e);
            }
        }
    }

    /**
     * Parses a URL, accepting only the {@code http} scheme.
     *
     * @param url the text to parse
     * @return the parsed URL, or {@code null} if it is malformed or not http
     */
    private URL verifyUrl(String url) {
        if (!url.regionMatches(true, 0, "http://", 0, "http://".length())) {
            return null;
        }
        try {
            return new URL(url);
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /**
     * Tells whether the host's robots.txt permits fetching the given URL.
     * Rules are fetched once per authority and cached; if robots.txt cannot be
     * fetched the URL is treated as allowed and nothing is cached.
     *
     * @param urlToCheck the URL to test
     * @return {@code false} if a Disallow rule is a prefix of the URL's path
     */
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase(Locale.ENGLISH);
        int port = urlToCheck.getPort();
        // Keep an explicit port: robots.txt lives on the same authority as the URL.
        String authority = (port == -1) ? host : host + ":" + port;

        List<String> disallowList = disallowListCache.get(authority);
        if (disallowList == null) {
            try {
                disallowList = fetchDisallowList(authority);
            } catch (IOException e) {
                return true;
            }
            disallowListCache.put(authority, disallowList);
        }
        return isPathAllowed(urlToCheck.getFile(), disallowList);
    }

    /**
     * Downloads {@code http://authority/robots.txt} and collects its Disallow
     * paths.
     *
     * @param authority host, optionally followed by {@code :port}
     * @return the non-empty Disallow paths in file order
     * @throws IOException if the file cannot be fetched or read
     */
    private List<String> fetchDisallowList(String authority) throws IOException {
        URL robotsFileUrl = new URL("http://" + authority + "/robots.txt");
        List<String> rules = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(robotsFileUrl.openStream(), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String path = parseDisallowPath(line);
                if (path != null) {
                    rules.add(path);
                }
            }
        } finally {
            reader.close();
        }
        return rules;
    }

    /**
     * Extracts the path from one robots.txt line if it is a Disallow directive.
     * Package-private so it can be tested without network access.
     *
     * @param line a raw robots.txt line; may be {@code null}
     * @return the trimmed path with any {@code #} comment removed, or
     *         {@code null} if the line is not a Disallow directive or its path
     *         is empty (an empty Disallow restricts nothing)
     */
    static String parseDisallowPath(String line) {
        if (line == null) {
            return null;
        }
        String trimmed = line.trim();
        int directiveLength = DISALLOW_DIRECTIVE.length();
        if (!trimmed.regionMatches(true, 0, DISALLOW_DIRECTIVE, 0, directiveLength)) {
            return null;
        }
        String path = trimmed.substring(directiveLength);
        int commentIndex = path.indexOf('#');
        if (commentIndex != -1) {
            path = path.substring(0, commentIndex);
        }
        path = path.trim();
        return path.length() == 0 ? null : path;
    }

    /**
     * Tests a URL path against a list of disallowed path prefixes.
     * Package-private so it can be tested without network access.
     *
     * @param path         the path (and query) of the URL; {@code null} or
     *                     empty is treated as {@code "/"}
     * @param disallowList disallowed path prefixes
     * @return {@code false} if any non-empty prefix matches the path
     */
    static boolean isPathAllowed(String path, List<String> disallowList) {
        String target = (path == null || path.length() == 0) ? "/" : path;
        for (String rule : disallowList) {
            // An empty rule would match everything; it means "no restriction".
            if (rule.length() > 0 && target.startsWith(rule)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes the first {@code www.} that directly follows {@code ://}.
     *
     * @param url the URL text
     * @return the URL without that {@code www.}, or {@code url} unchanged
     */
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index == -1) {
            return url;
        }
        return url.substring(0, index + 3) + url.substring(index + 7);
    }

    /**
     * Registers a JVM shutdown hook that closes the result writer.
     *
     * @param searchCrawler the crawler whose writer is closed at shutdown
     */
    private static void addShutDownHook(final SearchCrawler searchCrawler) {
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                BufferedWriter out = searchCrawler.getBufferedWriter();
                if (out == null) {
                    return;
                }
                try {
                    out.close();
                } catch (IOException e) {
                    System.err.println("Failed to close result file: " + e);
                }
            }
        });
    }

    /**
     * Entry point: {@code java SearchCrawler url} checks one URL,
     * {@code java SearchCrawler -f filename} checks every URL listed in a file.
     *
     * @param args command-line arguments
     * @throws InterruptedException if interrupted while waiting for the crawler
     * @throws IOException          if closing the URL list file fails
     */
    public static void main(String[] args) throws InterruptedException, IOException {
        boolean singleUrl = args.length == 1;
        boolean fromFile = args.length == 2 && "-f".equals(args[0]);
        if (!singleUrl && !fromFile) {
            System.out.println("Usage-1:java SearchCrawler url");
            System.out.println("Usage-2:java SearchCrawler -f filename");
            return;
        }

        SearchCrawler crawler = singleUrl
                ? new SearchCrawler(args[0], false)
                : new SearchCrawler(args[1], true);
        addShutDownHook(crawler);
        crawler.setDaemon(true);
        crawler.start();
        crawler.join();
    }
}

 

java robots协议检测工具

标签:

原文地址:http://www.cnblogs.com/cane/p/5383735.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!