标签:des style blog http io color ar os 使用
package parser;
import org.htmlparser.Parser;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
/**
*htmlparser取得一段html代码里面所有的链接地址和链接名称
*
*@author chenguoyong
*
*/
public class Testhtmlparser {
/**
* @param args
*/
publicstatic void main(String[] args) {
Stringhtmlcode ="<HTML><HEAD><TITLE>AAA</TITLE></HEAD><BODY>"
+"<a href=‘http://topic.csdn.net/u/20080522/14/0ff402ef-c382-499a-8213-ba6b2f550425.html‘>连接1</a>"
+"<a href=‘http://topic.csdn.net‘>连接2</a></BODY></HTML>";
//创建Parser对象根据传给字符串和指定的编码
Parserparser = Parser.createParser(htmlcode, "GBK");
//创建HtmlPage对象HtmlPage(Parser parser)
HtmlPagepage = new HtmlPage(parser);
try{
//HtmlPage extends visitor,Apply the given visitor to the current
//page.
parser.visitAllNodesWith(page);
}catch (ParserException e1) {
e1= null;
}
//所有的节点
NodeListnodelist = page.getBody();
//建立一个节点filter用于过滤节点
NodeFilterfilter = new TagNameFilter("A");
//得到所有过滤后,想要的节点
nodelist= nodelist.extractAllNodesThatMatch(filter, true);
for(int i = 0; i < nodelist.size(); i++) {
LinkTaglink = (LinkTag) nodelist.elementAt(i);
//链接地址
System.out.println(link.getAttribute("href")+ "\n");
//链接名称
System.out.println(link.getStringText());
}
}
}
结果如下:
http://topic.csdn.net/u/20080522/14/0ff402ef-c382-499a-8213-ba6b2f550425.html
连接1
http://topic.csdn.net
连接2
package parser;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
* 使用HtmlParser抓去网页内容: 要抓去页面的内容最方便的方法就是使用StringBean. 里面有几个控制页面内容的几个参数.
* 在后面的代码中会有说明. Htmlparser包中还有一个示例StringExtractor 里面有个直接得到内容的方法,
* 其中也是使用了StringBean . 另外直接解析Parser的每个标签也可以的.
*
*@author chenguoyong
*
*/
public class GetContent {
publicvoid getContentUsingStringBean(String url) {
StringBeansb = new StringBean();
sb.setLinks(true);// 是否显示web页面的连接(Links)
//为了取得页面的整洁美观一般设置上面两项为true , 如果要保持页面的原有格式, 如代码页面的空格缩进 可以设置为false
sb.setCollapse(true);// 如果是true的话把一系列空白字符用一个字符替代.
sb.setReplaceNonBreakingSpaces(true);//If true regular space
sb
.setURL("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html");
System.out.println("TheContent is :\n" + sb.getStrings());
}
publicvoid getContentUsingStringExtractor(String url, boolean link) {
//StringExtractor内部机制和上面的一样.做了一下包装
StringExtractorse = new StringExtractor(url);
Stringtext = null;
try{
text= se.extractStrings(link);
System.out.println("Thecontent is :\n" + text);
}catch (ParserException e) {
e.printStackTrace();
}
}
publicvoid getContentUsingParser(String url) {
NodeListnl;
try{
Parserp = new Parser(url);
nl= p.parse(new NodeClassFilter(BodyTag.class));
BodyTagbt = (BodyTag) nl.elementAt(0);
System.out.println(bt.toPlainTextString());// 保留原来的内容格式. 包含js代码
}catch (ParserException e) {
e.printStackTrace();
}
}
/**
* @param args
*/
publicstatic void main(String[] args) {
Stringurl = "http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html";
//newGetContent().getContentUsingParser(url);
//--------------------------------------------------
newGetContent().getContentUsingStringBean(url);
}
}
package parser;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Basic page grabber: downloads the complete HTML of one URL, echoes it to
 * standard output and saves it to a file.
 *
 * @author chenguoyong
 */
public class ScrubSelectedWeb {
    private final static String CRLF = System.getProperty("line.separator");

    /** Page fetched when no URL is given on the command line. */
    private static final String DEFAULT_URL = "http://www.google.cn/";

    /** File written when no output path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "D:/outPut.txt";

    /**
     * Downloads a page and stores it in a file.
     *
     * @param args optional: {@code args[0]} is the URL to download and
     *             {@code args[1]} the file to write; each falls back to its
     *             built-in default when absent
     */
    public static void main(String[] args) {
        String address = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        String target = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;
        try {
            String html = download(address);
            System.out.println(html);
            // The output file is only opened once the download succeeded, so a
            // failed download does not leave an empty file behind.
            save(html, target);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads everything behind {@code address} line by line; every line is
     * terminated with the platform line separator.
     * NOTE(review): bytes are decoded with the platform default charset, as
     * before; confirm whether the page's charset should be used instead.
     */
    private static String download(String address) throws IOException {
        InputStream stream = new URL(address).openStream();
        BufferedReader in = new BufferedReader(new InputStreamReader(stream));
        try {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line).append(CRLF);
            }
            return page.toString();
        } finally {
            // Closing the reader also closes the underlying URL stream.
            in.close();
        }
    }

    /** Writes {@code content} to the file at {@code path}, replacing it. */
    private static void save(String content, String path) throws IOException {
        BufferedWriter out = new BufferedWriter(new FileWriter(path));
        try {
            out.write(content);
        } finally {
            out.close();
        }
    }
}
package parser;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
* 标题:利用htmlparser提取网页纯文本的例子
*/
public class TestHTMLParser2 {
/**
* 读取目标html内容
*
*/
publicstatic void testHtml() {
try{
StringsCurrentLine;
StringsTotalString;
sCurrentLine= "";
sTotalString= "";
java.io.InputStreaml_urlStream;
java.net.URLl_url = new java.net.URL(
"http://10.249.187.199:8083/injs100/");
java.net.HttpURLConnectionl_connection = (java.net.HttpURLConnection) l_url
.openConnection();
l_connection.connect();
l_urlStream= l_connection.getInputStream();
java.io.BufferedReaderl_reader = new java.io.BufferedReader(
newjava.io.InputStreamReader(l_urlStream));
while((sCurrentLine = l_reader.readLine()) != null) {
sTotalString+= sCurrentLine + "\r\n";
}
StringtestText = extractText(sTotalString);
}catch (Exception e) {
e.printStackTrace();
}
}
/**
* 抽取纯文本信息
* @param inputHtml:html文本
* @return
* @throws Exception
*/
publicstatic String extractText(String inputHtml) throws Exception {
StringBuffertext = new StringBuffer();
Parserparser = Parser.createParser(new String(inputHtml.getBytes(),
"GBK"),"GBK");
//遍历所有的节点
NodeListnodes = parser.extractAllNodesThatMatch(new NodeFilter() {
publicboolean accept(Node node) {
returntrue;
}
});
System.out.println(nodes.size());
for(int i = 0; i < nodes.size(); i++) {
Nodenodet = nodes.elementAt(i);
//字符串的代表性节点:节点的描述
text.append(newString(nodet.toPlainTextString().getBytes("GBK"))
+"\r\n");
}
returntext.toString();
}
/**
* 读取文件的方式/utl 来分析内容.filePath也可以是一个Url.
* @param resource :文件/Url
* @throws Exception
*/
publicstatic void test5(String resource) throws Exception {
ParsermyParser = new Parser(resource);
myParser.setEncoding("GBK");
StringfilterStr = "table";
NodeFilterfilter = new TagNameFilter(filterStr);
NodeListnodeList = myParser.extractAllNodesThatMatch(filter);
/*for(inti=0;i<nodeList.size();i++)
{
TableTagtabletag = (TableTag) nodeList.elementAt(i);
//标签名称
System.out.println(tabletag.getTagName());
System.out.println(tabletag.getText());
}*/
TableTagtabletag = (TableTag) nodeList.elementAt(1);
}
publicstatic void main(String[] args) throws Exception {
test5("http://10.249.187.199:8083/injs100/");
//testHtml();
}
}
package parser;
import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import junit.framework.TestCase;
public class ParserTestCase extendsTestCase {
privatestatic final Logger logger = Logger.getLogger(ParserTestCase.class);
publicParserTestCase(String name) {
super(name);
}
/**
* 测试对<table>
* <tr>
* <td></td>
* </tr>
* </table>的解析
*/
publicvoid testTable() {
ParsermyParser;
NodeListnodeList = null;
myParser= Parser
.createParser(
"<body>"
+"<table id=’table1′ >"
+"<tr id=‘tro1‘><td>1-11</td><td>1-12</td><td>1-13</td></tr>"
+"<trid=‘tro2‘><td>1-21</td><td>1-22</td><td>1-23</td></tr>"
+"<trid=‘tro3‘><td>1-31</td><td>1-32</td><td>1-33</td></tr></table>"
+"<table id=’table2′ >"
+"<tr id=‘tro4‘><td>2-11</td><td>2-12</td><td>2-13</td></tr>"
+"<trid=‘tro5‘><td>2-21</td><td>2-22</td><td>2-23</td></tr>"
+"<trid=‘tro6‘><td>2-31</td><td>2-32</td><td>2-33</td></tr></table>"
+"</body>", "GBK");
NodeFiltertableFilter = new NodeClassFilter(TableTag.class);
OrFilterlastFilter = new OrFilter();
lastFilter.setPredicates(newNodeFilter[] { tableFilter });
try{
nodeList= myParser.parse(lastFilter);
for(int i = 0; i <= nodeList.size(); i++) {
if(nodeList.elementAt(i) instanceof TableTag) {
TableTagtag = (TableTag) nodeList.elementAt(i);
TableRow[]rows = tag.getRows();
for(int j = 0; j < rows.length; j++) {
TableRowtr = (TableRow) rows[j];
System.out.println(tr.getAttribute("id"));
if(tr.getAttribute("id").equalsIgnoreCase("tro1")) {
TableColumn[]td = tr.getColumns();
for(int k = 0; k < td.length; k++) {
//logger.fatal("<td>" +
//td[k].toPlainTextString());
System.out.println("<td>"
+td[k].toPlainTextString());
}
}
}
}
}
}catch (ParserException e) {
e.printStackTrace();
}
}
/**
* 得到目标数据
*
* @param url:目标url
* @throws Exception
*/
publicstatic void getDatabyUrl(String url) throws Exception {
ParsermyParser = new Parser(url);
NodeListnodeList = null;
myParser.setEncoding("gb2312");
NodeFiltertableFilter = new NodeClassFilter(TableTag.class);
OrFilterlastFilter = new OrFilter();
lastFilter.setPredicates(newNodeFilter[] { tableFilter });
try{
nodeList= myParser.parse(lastFilter);
//可以从数据table的size:19-21开始到结束
for(int i = 15; i <= nodeList.size(); i++) {
if(nodeList.elementAt(i) instanceof TableTag) {
TableTagtag = (TableTag) nodeList.elementAt(i);
TableRow[]rows = tag.getRows();
for(int j = 0; j < rows.length; j++) {
TableRowtr = (TableRow) rows[j];
if(tr.getAttribute("id") != null
&&tr.getAttribute("id").equalsIgnoreCase(
"tr02")){
TableColumn[]td = tr.getColumns();
//对不起,没有你要查询的记录!
if(td.length == 1) {
System.out.println("对不起,没有你要查询的记录");
}else {
for(int k = 0; k < td.length; k++) {
System.out.println("<td>内容:"
+td[k].toPlainTextString().trim());
}
}
}
}
}
}
}catch (ParserException e) {
e.printStackTrace();
}
}
/**
* 测试已经得出有数据时table:22个,没有数据时table:19个
*
* @param args
*/
publicstatic void main(String[] args) {
try{
//getDatabyUrl("http://gd.12530.com/user/querytonebytype.do?field=tonecode&condition=619505000000008942&type=1006&pkValue=619505000000008942");
getDatabyUrl("http://gd.12530.com/user/querytonebytype.do?field=tonecode&condition=619272000000001712&type=1006&pkValue=619272000000001712");
}catch (Exception e) {
e.printStackTrace();
}
}
}
package com.jscud.test;
标签:des style blog http io color ar os 使用
原文地址:http://blog.csdn.net/lanyan822/article/details/41116207