标签:javaweb www import open 网络 crawl http except nec
网络爬虫(web crawler,又称网页蜘蛛、网络机器人、网页追逐者)是一种按照一定的规则,自动抓取万维网信息的程序。
最简单的网络爬虫示例:读取页面中所有的邮箱地址。
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web crawler: downloads a single page and prints every e-mail
 * address found in its text, one per line.
 */
public class WebCrawler {

    /** Page fetched when no URL is supplied on the command line. */
    private static final String DEFAULT_URL = "https://www.meizu.com/contact.html";

    /**
     * Loose e-mail shape: word characters, {@code @}, word characters, then one
     * or more {@code .suffix} groups. Compiled once instead of on every run of
     * the read loop's enclosing method.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Fetches a page and prints each e-mail address it contains to standard output.
     *
     * @param args optional; {@code args[0]} is the URL to crawl
     *             (for example {@code http://localhost:8080/JavaWeb/index.jsp}).
     *             When absent, {@link #DEFAULT_URL} is used.
     * @throws IOException if the URL is malformed, the connection cannot be
     *                     opened, or reading the response fails
     */
    public static void main(String[] args) throws IOException {
        String address = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        URLConnection conn = new URL(address).openConnection();

        // try-with-resources closes the reader (and the underlying stream) even
        // when reading fails part-way. The charset is explicit so decoding does
        // not depend on the platform default.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                for (String mail : extractEmails(line)) {
                    System.out.println(mail);
                }
            }
        }
    }

    /**
     * Returns every substring of {@code line} that matches {@link #MAIL_PATTERN},
     * in order of appearance.
     *
     * @param line text to scan; {@code null} is treated as empty
     * @return the matches, or an empty list when there are none
     */
    static List<String> extractEmails(String line) {
        List<String> mails = new ArrayList<>();
        if (line == null) {
            return mails;
        }
        Matcher matcher = MAIL_PATTERN.matcher(line);
        while (matcher.find()) {
            mails.add(matcher.group());
        }
        return mails;
    }
}
标签:javaweb www import open 网络 crawl http except nec
原文地址:http://www.cnblogs.com/AndyHoo/p/6367562.html