标签:表达式 str ted stack ref 正则表达式 cat string net
Java 版网络爬虫的基本思路是:先获取网页的源码内容,再用正则表达式从中提取所需的信息。
package xuexi;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web-crawler example: download the text behind a URL, then pull
 * substrings out of it with a regular expression.
 */
public class webtest {

    /**
     * Downloads the content behind {@code urlStr} and returns it as one string.
     *
     * <p>The content is read line by line and the lines are concatenated
     * <em>without</em> their line terminators. Failures are handled on a
     * best-effort basis: the stack trace is printed and whatever was read
     * before the failure (possibly the empty string) is returned.
     *
     * @param urlStr  URL to read, e.g. {@code "https://www.qq.com/"}
     * @param charset name of the charset used to decode the response bytes
     * @return the decoded content with line terminators removed; never {@code null}
     */
    public static String getURLContent(String urlStr, String charset) {
        StringBuilder sb = new StringBuilder();
        try {
            // Resolve the charset before opening the connection so that a bad
            // charset name cannot leave an already-opened stream unclosed.
            Charset cs = Charset.forName(charset);
            URL url = new URL(urlStr);
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(url.openStream(), cs))) {
                String line;
                // Test for end-of-stream BEFORE appending; appending the null
                // sentinel would add the literal text "null" to the result.
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (IOException | RuntimeException e) {
            // Deliberately best-effort: report the problem and return what we have.
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Collects every substring of {@code destStr} matched by {@code regexStr}.
     *
     * @param destStr  text to search; must not be {@code null}
     * @param regexStr regular expression, in {@link Pattern} syntax
     * @return the full text of each match (group 0) in order of appearance;
     *         empty if nothing matches
     * @throws java.util.regex.PatternSyntaxException if {@code regexStr} is not a valid pattern
     */
    public static List<String> getMatherSubstrs(String destStr, String regexStr) {
        List<String> result = new ArrayList<String>();
        Pattern p = Pattern.compile(regexStr);
        Matcher m = p.matcher(destStr);
        while (m.find()) {
            result.add(m.group(0));
        }
        return result;
    }

    /**
     * Demo entry point: fetches the qq.com home page and prints every
     * {@code href="..."} attribute matched by the pattern below.
     */
    public static void main(String[] args) throws IOException {
        String content = getURLContent("https://www.qq.com/", "utf-8");
        List<String> list = getMatherSubstrs(content, "href=\"+[\\w./:]+\"");
        for (String a : list) {
            System.out.println(a);
        }
    }
}
标签:表达式 str ted stack ref 正则表达式 cat string net
原文地址:https://www.cnblogs.com/hzcjd/p/12227948.html