码迷,mamicode.com
首页 > Web开发 > 详细

爬网页获取电话号码

时间:2018-08-07 00:31:16      阅读:465      评论:0      收藏:0      [点我收藏+]

标签:int   while   app   match   new   lse   exce   htm   readline   

明天补充

 1 import java.io.BufferedReader;
 2 import java.io.IOException;
 3 import java.io.InputStream;
 4 import java.io.InputStreamReader;
 5 import java.net.URL;
 6 import java.net.URLConnection;
 7 import java.util.regex.Matcher;
 8 import java.util.regex.Pattern;
 9 
10 public class URLTest {
11 
12     private URLConnection connection = null;
13     
14     public String getDoucument(String url) throws IOException {
15         
16         URL newUrl = new URL(url);
17         connection = newUrl.openConnection();
18         InputStream is = connection.getInputStream();
19         InputStreamReader isr = new InputStreamReader(is);
20         BufferedReader br = new BufferedReader(isr);
21         
22         String line = "";
23         StringBuffer sb = new StringBuffer();
24         
25         while( (line =  br.readLine()) != null) {
26             sb.append(line +"\n");
27         }
28         
29         return sb.toString();
30         
31     }
32     
33     public String MyFilter( String string ) {//内容过滤器-获取网页上的电话,没有去重
34         
35         String regex = "18[\\d]{9}";
36         Pattern pattern = Pattern.compile(regex);
37         Matcher matcher = pattern.matcher(string);
38         
39         String result = "";
40         while(matcher.find()) {
41             result += matcher.group()+"\n";
42         }
43         
44         return result;
45     }
46     
47     public static void main(String[] args) throws IOException {
48         
49         URLTest test = new URLTest();
50         String page = test.getDoucument("https://power.baidu.com/question/1511959129473915060.html?qbl=relate_question_1");
51         String result = test.MyFilter(page);
52 //        System.out.println(page);
53         
54         if ( result != null ) {
55             System.out.println("从该网页上找到的号码:\n"+result);
56         }
57         else {
58             System.out.println("该网页上没有电话号码");
59         }
60         
61     }
62 }

 

程序效果

技术分享图片

爬网页获取电话号码

标签:int   while   app   match   new   lse   exce   htm   readline   

原文地址:https://www.cnblogs.com/ynhwl/p/9434230.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!