标签:c style class java tar http
本文转载自姚虎才子
今天做项目时用到java抓取网页内容,本以为是很简单的一件事,但还是让我蛋疼了一会。网上资料一大堆,但都是通过url抓取网页内容,而我要的是读取本地html页面内容的方法,网上找不到,怎么办?我瞬间懵了!
首先还是向大家讲解一下通过url抓取网页内容吧,通过正则表达式摘取title、js、css等网页元素,代码如下:
[java]
import java.io.BufferedReader;
import
java.io.IOException;
import java.io.InputStreamReader;
import
java.net.MalformedURLException;
import java.net.URL;
import
java.util.ArrayList;
import java.util.List;
import
java.util.regex.Matcher;
import
java.util.regex.Pattern;
/**
 * Downloads a web page and extracts common elements (title, links, scripts,
 * inline styles) from its HTML text using regular expressions.
 *
 * <p>The extraction is a regex heuristic, not an HTML parser: it works on
 * ordinary markup but does not understand comments, CDATA or nested tags.
 *
 * @author yaohucaizi
 */
public class Test {

    /** Charset used by {@link #getHtmlContent(String)}; kept for backward compatibility. */
    private static final String DEFAULT_CHARSET = "gbk";

    // Patterns are compiled once. CASE_INSENSITIVE makes <script>/<SCRIPT> etc.
    // equivalent, DOTALL lets an element span several lines, and \b stops
    // "<a" from also matching "<abbr>", "<article>", ...
    private static final int FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title\\b[^>]*>.*?</title>", FLAGS);

    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\b[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>", FLAGS);

    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script\\b.*?</script>", FLAGS);

    private static final Pattern ANCHOR_PATTERN =
            Pattern.compile("<a\\b.*?</a>", FLAGS);

    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style\\b.*?</style>", FLAGS);

    /** Any single tag; deliberately without DOTALL, matching the original behaviour. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Reads the whole page at {@code htmlurl}, decoding it as GBK.
     *
     * @param htmlurl absolute URL of the page
     * @return the page text with line terminators removed; see
     *         {@link #getHtmlContent(String, String)}
     */
    public String getHtmlContent(String htmlurl) {
        return getHtmlContent(htmlurl, DEFAULT_CHARSET);
    }

    /**
     * Reads the whole page at {@code htmlurl} using the given charset.
     *
     * <p>Lines are concatenated without a separator, so the result contains no
     * line breaks. Errors are reported on the console rather than thrown.
     *
     * @param htmlurl     absolute URL of the page
     * @param charsetName name of the charset used to decode the response, e.g. {@code "UTF-8"}
     * @return the page text; an empty string if the URL is malformed, or whatever
     *         was read before the failure if an I/O error occurs
     */
    public String getHtmlContent(String htmlurl, String charsetName) {
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(htmlurl);
            // The raw stream is its own resource so that it is closed even when
            // the charset name is rejected by InputStreamReader.
            try (java.io.InputStream raw = url.openStream();
                 BufferedReader in = new BufferedReader(new InputStreamReader(raw, charsetName))) {
                String line;
                while ((line = in.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!");
            System.err.println(me.getMessage());
        } catch (final IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Returns the text of every {@code <title>} element, concatenated, with tags removed.
     *
     * @param s HTML text; may be {@code null}
     * @return the title text, or an empty string if there is none
     */
    public String getTitle(String s) {
        StringBuilder title = new StringBuilder();
        for (String match : findAll(TITLE_PATTERN, s)) {
            title.append(match);
        }
        return outTag(title.toString());
    }

    /**
     * Returns every {@code <a>} element that carries an {@code href} attribute.
     *
     * @param s HTML text; may be {@code null}
     * @return the matched elements (markup included), in document order
     */
    public List<String> getLink(String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Returns every {@code <script>} element.
     *
     * @param s HTML text; may be {@code null}
     * @return the matched elements (markup included), in document order
     */
    public List<String> getScript(String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Returns every {@code <a>} element, with or without an {@code href}.
     *
     * @param s HTML text; may be {@code null}
     * @return the matched elements (markup included), in document order
     */
    public List<String> getNews(String s) {
        return findAll(ANCHOR_PATTERN, s);
    }

    /**
     * Returns every inline {@code <style>} element.
     *
     * @param s HTML text; may be {@code null}
     * @return the matched elements (markup included), in document order
     */
    public List<String> getCSS(String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Strips all tags from {@code s}.
     *
     * @param s HTML text; may be {@code null}
     * @return {@code s} without anything of the form {@code <...>}; an empty
     *         string for {@code null}
     */
    public String outTag(final String s) {
        if (s == null) {
            return "";
        }
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /** Collects every match of {@code pattern} in {@code s}; empty list for {@code null}. */
    private static List<String> findAll(Pattern pattern, String s) {
        List<String> matches = new ArrayList<String>();
        if (s == null) {
            return matches;
        }
        Matcher m = pattern.matcher(s);
        while (m.find()) {
            matches.add(m.group());
        }
        return matches;
    }

    public static void main(String[] args) {
        Test t = new Test();
        String content = t.getHtmlContent("http://www.taobao.com");
        // Optional: convert <br> runs into line breaks.
        // content = content.replaceAll("(<br>)+?", "\n");
        // Optional: drop image captions.
        // content = content.replaceAll("<p><em>.*?</em></p>", "");
        System.out.println(content);
        System.out.println(t.getTitle(content));
        List<String> news = new ArrayList<String>();
        for (String anchor : t.getNews(content)) {
            news.add(t.outTag(anchor));
        }
        System.out.println(news);
        // Extracting scripts, styles, etc. works the same way and is omitted here.
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import
java.io.InputStreamReader;
import java.net.MalformedURLException;
import
java.net.URL;
import java.util.ArrayList;
import java.util.List;
import
java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page and extracts common elements (title, links, scripts,
 * inline styles) from its HTML text using regular expressions.
 *
 * <p>The extraction is a regex heuristic, not an HTML parser: it works on
 * ordinary markup but does not understand comments, CDATA or nested tags.
 *
 * @author yaohucaizi
 */
public class Test {

    /** Charset used by {@link #getHtmlContent(String)}; kept for backward compatibility. */
    private static final String DEFAULT_CHARSET = "gbk";

    // Patterns are compiled once. CASE_INSENSITIVE makes <script>/<SCRIPT> etc.
    // equivalent, DOTALL lets an element span several lines, and \b stops
    // "<a" from also matching "<abbr>", "<article>", ...
    private static final int FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title\\b[^>]*>.*?</title>", FLAGS);

    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\b[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>", FLAGS);

    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script\\b.*?</script>", FLAGS);

    private static final Pattern ANCHOR_PATTERN =
            Pattern.compile("<a\\b.*?</a>", FLAGS);

    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style\\b.*?</style>", FLAGS);

    /** Any single tag; deliberately without DOTALL, matching the original behaviour. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Reads the whole page at {@code htmlurl}, decoding it as GBK.
     *
     * @param htmlurl absolute URL of the page
     * @return the page text with line terminators removed; see
     *         {@link #getHtmlContent(String, String)}
     */
    public String getHtmlContent(String htmlurl) {
        return getHtmlContent(htmlurl, DEFAULT_CHARSET);
    }

    /**
     * Reads the whole page at {@code htmlurl} using the given charset.
     *
     * <p>Lines are concatenated without a separator, so the result contains no
     * line breaks. Errors are reported on the console rather than thrown.
     *
     * @param htmlurl     absolute URL of the page
     * @param charsetName name of the charset used to decode the response, e.g. {@code "UTF-8"}
     * @return the page text; an empty string if the URL is malformed, or whatever
     *         was read before the failure if an I/O error occurs
     */
    public String getHtmlContent(String htmlurl, String charsetName) {
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(htmlurl);
            // The raw stream is its own resource so that it is closed even when
            // the charset name is rejected by InputStreamReader.
            try (java.io.InputStream raw = url.openStream();
                 BufferedReader in = new BufferedReader(new InputStreamReader(raw, charsetName))) {
                String line;
                while ((line = in.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!");
            System.err.println(me.getMessage());
        } catch (final IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Returns the text of every {@code <title>} element, concatenated, with tags removed.
     *
     * @param s HTML text; may be {@code null}
     * @return the title text, or an empty string if there is none
     */
    public String getTitle(String s) {
        StringBuilder title = new StringBuilder();
        for (String match : findAll(TITLE_PATTERN, s)) {
            title.append(match);
        }
        return outTag(title.toString());
    }

    /**
     * Returns every {@code <a>} element that carries an {@code href} attribute.
     *
     * @param s HTML text; may be {@code null}
     * @return the matched elements (markup included), in document order
     */
    public List<String> getLink(String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Returns every {@code <script>} element.
     *
     * @param s HTML text; may be {@code null}
     * @return the matched elements (markup included), in document order
     */
    public List<String> getScript(String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Returns every {@code <a>} element, with or without an {@code href}.
     *
     * @param s HTML text; may be {@code null}
     * @return the matched elements (markup included), in document order
     */
    public List<String> getNews(String s) {
        return findAll(ANCHOR_PATTERN, s);
    }

    /**
     * Returns every inline {@code <style>} element.
     *
     * @param s HTML text; may be {@code null}
     * @return the matched elements (markup included), in document order
     */
    public List<String> getCSS(String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Strips all tags from {@code s}.
     *
     * @param s HTML text; may be {@code null}
     * @return {@code s} without anything of the form {@code <...>}; an empty
     *         string for {@code null}
     */
    public String outTag(final String s) {
        if (s == null) {
            return "";
        }
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /** Collects every match of {@code pattern} in {@code s}; empty list for {@code null}. */
    private static List<String> findAll(Pattern pattern, String s) {
        List<String> matches = new ArrayList<String>();
        if (s == null) {
            return matches;
        }
        Matcher m = pattern.matcher(s);
        while (m.find()) {
            matches.add(m.group());
        }
        return matches;
    }

    public static void main(String[] args) {
        Test t = new Test();
        String content = t.getHtmlContent("http://www.taobao.com");
        // Optional: convert <br> runs into line breaks.
        // content = content.replaceAll("(<br>)+?", "\n");
        // Optional: drop image captions.
        // content = content.replaceAll("<p><em>.*?</em></p>", "");
        System.out.println(content);
        System.out.println(t.getTitle(content));
        List<String> news = new ArrayList<String>();
        for (String anchor : t.getNews(content)) {
            news.add(t.outTag(anchor));
        }
        System.out.println(news);
        // Extracting scripts, styles, etc. works the same way and is omitted here.
    }
}
后来我想了想,觉得读取本地文件和通过url读取的原理不是一样的嘛!但是我尝试了好多种写法都不行,不是乱码就是报错,我该怎么办?
老天就是这样捉弄人,不过功夫不负有心人,当我尝试到第999次的时候突然眼前一亮:我成功实现读取本地html了……说真的,代码不难,但是需要多次尝试。把我的代码分享给大家:
[java]
/**
 * Reads a local HTML file and returns its text as a single string.
 *
 * <p>The file is decoded as GB2312. Lines are concatenated without a
 * separator, so the result contains no line breaks. I/O errors are printed
 * to the console rather than thrown.
 *
 * @param filePath path of the local file to read
 * @return the file text; whatever was read before the failure (possibly an
 *         empty string) if an I/O error occurs
 */
public static String getHtmlContent(String filePath) {
    StringBuilder sb = new StringBuilder();
    // try-with-resources closes both the reader and the underlying file
    // stream on every path; the original never closed them.
    try (FileInputStream raw = new FileInputStream(filePath);
         BufferedReader br = new BufferedReader(new InputStreamReader(raw, "GB2312"))) {
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return sb.toString();
}
/**
 * Reads a local HTML file and returns its text as a single string.
 *
 * <p>The file is decoded as GB2312. Lines are concatenated without a
 * separator, so the result contains no line breaks. I/O errors are printed
 * to the console rather than thrown.
 *
 * @param filePath path of the local file to read
 * @return the file text; whatever was read before the failure (possibly an
 *         empty string) if an I/O error occurs
 */
public static String getHtmlContent(String filePath) {
    StringBuilder sb = new StringBuilder();
    // try-with-resources closes both the reader and the underlying file
    // stream on every path; the original never closed them.
    try (FileInputStream raw = new FileInputStream(filePath);
         BufferedReader br = new BufferedReader(new InputStreamReader(raw, "GB2312"))) {
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return sb.toString();
}
标签:c style class java tar http
原文地址:http://www.cnblogs.com/abc8023/p/3757039.html