标签:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web scraper: downloads the resource behind a URL and prints every
 * substring that matches a regular expression (by default, e-mail addresses).
 */
public class RegexWeb {

    /** Timeout, in milliseconds, applied to both connecting and reading. */
    private static final int TIMEOUT_MILLIS = 1000 * 10;

    /**
     * Scrapes a fixed forum thread and prints every e-mail-like token found in it.
     *
     * @param args ignored
     * @throws Exception if the page cannot be fetched or read
     */
    public static void main(String[] args) throws Exception {
        // Page to scan.
        String str_url = "http://tieba.baidu.com/p/2314539885";
        // E-mail pattern: local part with optional dotted segments, '@', a host
        // label, then one to three dot-separated suffixes of 2-3 word characters.
        // A simpler alternative is "\\w+@\\w+\\.[a-zA-Z]{2,3}".
        String regex = "(\\w)+(\\.\\w+)*@(\\w)+((\\.\\w{2,3}){1,3})";
        regexForWeb(str_url, regex);
    }

    /**
     * Reads the resource at {@code str_url} line by line and prints each
     * substring matching {@code regex} to standard output, one match per line.
     * Matches never span a line break because every line is searched separately.
     *
     * @param str_url address of the resource to read
     * @param regex   regular expression to search for
     * @throws IOException if the URL is malformed, the connection fails, or a
     *                     connect/read timeout expires
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     */
    private static void regexForWeb(String str_url, String regex) throws IOException {
        // Compile first so an invalid pattern fails before any network access.
        Pattern pattern = Pattern.compile(regex);

        URLConnection conn = new URL(str_url).openConnection();
        conn.setConnectTimeout(TIMEOUT_MILLIS);
        // Without a read timeout a stalled server would block readLine() forever.
        conn.setReadTimeout(TIMEOUT_MILLIS);

        // try-with-resources closes the reader (and the underlying stream) even
        // when reading fails part-way through.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), charsetOf(conn)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                printMatches(pattern, line);
            }
        }
    }

    /** Prints every non-overlapping match of {@code pattern} within {@code line}. */
    private static void printMatches(Pattern pattern, String line) {
        Matcher matcher = pattern.matcher(line);
        while (matcher.find()) {
            System.out.println(matcher.group());
        }
    }

    /**
     * Picks the charset named by the {@code charset=} parameter of the
     * connection's content type, falling back to UTF-8 when the parameter is
     * absent or names a charset this JVM does not support.
     */
    private static Charset charsetOf(URLConnection conn) {
        final String key = "charset=";
        String contentType = conn.getContentType();
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String token = part.trim();
                if (token.regionMatches(true, 0, key, 0, key.length())) {
                    String name = token.substring(key.length()).replace("\"", "").trim();
                    try {
                        return Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Illegal or unsupported charset name: use the fallback below.
                    }
                }
            }
        }
        return StandardCharsets.UTF_8;
    }
}
标签:
原文地址:http://www.cnblogs.com/hefeisf/p/4976885.html