码迷,mamicode.com
首页 > Web开发 > 详细

Web爬虫入门

时间:2016-11-13 16:05:02      阅读:295      评论:0      收藏:0      [点我收藏+]

标签:抓取   nbsp   input   url   main   ext   pen   array   cat   

1.0示例学习:Web爬虫

/**
 * Minimal web-crawler example: starting from a seed URL, it visits pages in
 * first-in, first-out order and collects every {@code http:} link found on
 * each visited page.
 */
public class WebCrawler {

    // Seed URL the crawl starts from.
    private static final String SEED_URL = "http://www.cnblogs.com/";

    // Maximum number of distinct pages visited during one crawl.
    private static final int MAX_TRAVERSED_URLS = 100;

    // Prefix that marks the start of a link inside a line of page text.
    private static final String LINK_PREFIX = "http:";

    public static void main(String[] args) {
        ArrayList<String> list = crawler(SEED_URL);
        System.out.println("Length of listOfPendingURLs: " + list.size());
    }

    /**
     * Crawls outward from the given seed URL, visiting at most
     * {@value #MAX_TRAVERSED_URLS} distinct URLs.
     *
     * @param StartingURL the seed URL to start crawling from
     * @return the URLs that were discovered but still pending (not yet
     *         visited) when the crawl stopped; may contain duplicates
     */
    public static ArrayList<String> crawler(String StartingURL) {
        ArrayList<String> listOfPendingURLs = new ArrayList<String>();      // URLs waiting to be visited
        ArrayList<String> listOfTraversedURLs = new ArrayList<String>();    // URLs already visited

        listOfPendingURLs.add(StartingURL);
        // Strict '<' so that exactly MAX_TRAVERSED_URLS pages are visited
        // (the previous '<=' allowed one extra page).
        while (!listOfPendingURLs.isEmpty() && listOfTraversedURLs.size() < MAX_TRAVERSED_URLS) {
            // Always take the oldest pending URL first.
            String urlString = listOfPendingURLs.remove(0);
            if (listOfTraversedURLs.contains(urlString)) {
                continue;
            }
            listOfTraversedURLs.add(urlString);
            System.out.println("Crawl " + urlString);

            // Queue every link on this page that has not been visited yet.
            for (String s : getSubURLs(urlString)) {
                if (!listOfTraversedURLs.contains(s)) {
                    listOfPendingURLs.add(s);
                }
            }
        }

        return listOfPendingURLs;
    }

    /**
     * Reads the page at {@code urlString} line by line and extracts every
     * substring that starts with {@code http:} and is terminated by a double
     * quote on the same line.
     *
     * <p>Failures to parse or read the URL are reported on standard error and
     * result in the links collected so far (possibly none) being returned.
     *
     * @param urlString address of the page to scan
     * @return the links found on the page, in order of appearance
     */
    public static ArrayList<String> getSubURLs(String urlString) {
        ArrayList<String> list = new ArrayList<String>();
        try {
            Scanner input = new Scanner(new URL(urlString).openStream());
            try {
                while (input.hasNextLine()) {
                    collectLinks(input.nextLine(), list);
                }
            } finally {
                // Closing the scanner also closes the underlying stream.
                input.close();
            }
        } catch (MalformedURLException e) {
            System.err.println("Skipping malformed URL " + urlString + ": " + e);
        } catch (IOException e) {
            System.err.println("Failed to read " + urlString + ": " + e);
        }

        return list;
    }

    /** Appends every quote-terminated {@code http:} link found in {@code line} to {@code out}. */
    private static void collectLinks(String line, ArrayList<String> out) {
        // indexOf returns -1 when nothing is found; 0 is a valid match position,
        // so the loop condition must be '>= 0' to catch links at the line start.
        int begin = line.indexOf(LINK_PREFIX);
        while (begin >= 0) {
            int end = line.indexOf("\"", begin);
            if (end < 0) {
                // No closing quote on this line: the link is incomplete, stop scanning.
                break;
            }
            out.add(line.substring(begin, end));
            begin = line.indexOf(LINK_PREFIX, end);
        }
    }
}

 

Web爬虫入门

标签:抓取   nbsp   input   url   main   ext   pen   array   cat   

原文地址:http://www.cnblogs.com/petersong/p/6058709.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!