简单的网络爬虫

时间：2015-11-30 16:01:12 阅读：181 评论：0 收藏：0 [点我收藏+]

标签：

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web scraper: downloads one page and prints every substring of it
 * that matches a regular expression (by default, e-mail-like addresses).
 */
public class RegexWeb
{

    /** Timeout, in milliseconds, applied to both connecting and reading. */
    private static final int TIMEOUT_MILLIS = 1000 * 10;

    /**
     * Scans a fixed forum page for e-mail-like addresses and prints each match
     * on its own line.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the page cannot be fetched or read
     */
    public static void main(String[] args) throws Exception
    {
        // Page to scan.
        String str_url = "http://tieba.baidu.com/p/2314539885";
        // Matching rule: local part with optional dot-separated segments, '@',
        // a host label, then one to three ".xx"/".xxx" suffixes.
        // A simpler alternative would be "\\w+@\\w+\\.[a-zA-Z]{2,3}".
        String regex = "(\\w)+(\\.\\w+)*@(\\w)+((\\.\\w{2,3}){1,3})";
        regexForWeb(str_url,regex);
    }

    /**
     * Fetches the resource at {@code str_url} and prints, one per line, every
     * substring that matches {@code regex}. Matching is done line by line, so
     * a match can never span a line break.
     *
     * @param str_url address of the resource to read
     * @param regex   regular expression to search for in each line
     * @throws Exception if the URL is malformed, the pattern is invalid, or
     *                   the resource cannot be opened or read in time
     */
    private static void regexForWeb(String str_url,String regex) throws Exception
    {
        // Compile first: an invalid pattern then fails before any stream is opened.
        Pattern p = Pattern.compile(regex);

        URL url = new URL(str_url);
        URLConnection conn = url.openConnection();
        conn.setConnectTimeout(TIMEOUT_MILLIS);
        // Without a read timeout a stalled server would block readLine() forever.
        conn.setReadTimeout(TIMEOUT_MILLIS);

        // try-with-resources closes the reader (and the underlying stream) even
        // when reading fails part-way through.
        // NOTE: the single-argument InputStreamReader decodes with the platform
        // default charset; pass an explicit charset if non-ASCII text matters.
        try (BufferedReader buf = new BufferedReader(new InputStreamReader(conn.getInputStream())))
        {
            String line;
            while ((line = buf.readLine()) != null)
            {
                printMatches(p, line);
            }
        }
    }

    /** Prints every non-overlapping match of {@code p} found in {@code line}. */
    private static void printMatches(Pattern p, String line)
    {
        Matcher m = p.matcher(line);
        // find() performs partial matching, advancing past each previous match.
        while (m.find())
        {
            System.out.println(m.group());
        }
    }

}

简单的网络爬虫

标签：

原文地址：http://www.cnblogs.com/hefeisf/p/4976885.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行