最近在做一个手机APP,通过一个新闻抓取程序抓取新闻,然后通过APP展示新闻。后来发现手机端不支持Style标签:如果网页中有Style标签,则标签内的内容会直接显示出来,非常影响页面美观。于是就写了一个用NekoHTML来清除Style标签的工具类。
html.filter.properties 配置文件,配置允许的标签和要删除的标签及标签内的属性
attributes=style,id,name,class,width,height,src,oldsrc,complete,align,alt,title
acceptTags=div,span,a,li,ul,nav,br,p,img,font,b,strong,table,tr,td
removeTags=style
package com.tiamaes.gjds.util;
import java.io.IOException;
import java.util.Properties;
import org.springframework.core.io.ClassPathResource;
/**
 * <p>Description: reads values from a properties file located on the classpath.</p>
 * <p>Author: 王成委</p>
 * <p>Created: 2015-01-28 11:23:27</p>
 * <p>Copyright: © 2015 Tiamaes</p>
 */
public class PropertiesUtils {

    /** Loaded key/value pairs; stays empty when the resource could not be read. */
    private final Properties properties;

    /**
     * Loads the properties file found at the given classpath location.
     * An {@link IOException} while opening or reading the resource is printed
     * and otherwise ignored, leaving this instance with no entries.
     *
     * @param path classpath-relative location of the properties file
     */
    public PropertiesUtils(String path){
        properties = new Properties();
        java.io.InputStream in = null;
        try {
            ClassPathResource resource = new ClassPathResource(path);
            in = resource.getInputStream();
            properties.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Properties.load does not close the stream, so release it here.
            if(in != null){
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Looks up a single property.
     *
     * @param key property name
     * @return the property value, or {@code null} when the key is not present
     */
    public String get(String key){
        return this.properties.getProperty(key);
    }
}
package com.tiamaes.gjds.util;
import java.io.CharArrayReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.cyberneko.html.filters.Writer;
import org.cyberneko.html.parsers.DOMParser;
import org.xml.sax.InputSource;
/**
 * <p>Description: filters the tags and attributes of an HTML fragment using NekoHTML.</p>
 * <p>Author: 王成委</p>
 * <p>Created: 2015-01-29 10:45:02</p>
 * <p>Copyright: © 2015 Tiamaes</p>
 *
 * <p>The initial lists of accepted tags, removed tags and kept attributes are read
 * from {@code html.filter.properties}. The add/remove methods mutate plain
 * {@link ArrayList}s without synchronization, so callers that customize the shared
 * instance from several threads must coordinate that themselves.</p>
 */
public class HtmlFilterUtils {

    /** Configuration shared by every instance; loaded on first construction. */
    private static PropertiesUtils properties = null;

    /**
     * Lazily created shared instance. Declared volatile because
     * {@link #getInstance(boolean)} reads it without holding the lock.
     */
    private static volatile HtmlFilterUtils filter = null;

    private String configPath = "html.filter.properties";

    private static final String ATTRIBUTE_FIELD = "attributes";
    private static final String ACCEPT_TAGS_FIELD = "acceptTags";
    private static final String REMOVE_TAGS_FIELD = "removeTags";

    /** Attribute names kept on every accepted tag. */
    private List<String> attributes = new ArrayList<String>();
    /** Tags that are kept in the output. */
    private List<String> acceptTags = new ArrayList<String>();
    /** Tags that are registered for removal with the element remover. */
    private List<String> removeTags = new ArrayList<String>();

    /** Creates the shared instance if it does not exist yet; runs under the class lock. */
    private static synchronized void syncInit(){
        if(filter == null){
            filter = new HtmlFilterUtils();
        }
    }

    /**
     * Returns the shared filter instance, creating it on first use.
     *
     * @return the shared instance
     */
    public static HtmlFilterUtils getInstance(){
        return getInstance(false);
    }

    /**
     * Returns a filter instance.
     *
     * @param createNew {@code true} to get a fresh, independent instance;
     *                  {@code false} to get the shared one
     * @return the requested instance
     */
    public static HtmlFilterUtils getInstance(boolean createNew){
        if(createNew){
            return new HtmlFilterUtils();
        }
        if(filter == null){
            syncInit();
        }
        return filter;
    }

    /** Loads the configuration (once) and seeds the three lists from it. */
    private HtmlFilterUtils(){
        if(properties == null){
            properties = new PropertiesUtils(configPath);
        }
        this.addToList(attributes, properties.get(ATTRIBUTE_FIELD));
        this.addToList(acceptTags, properties.get(ACCEPT_TAGS_FIELD));
        this.addToList(removeTags, properties.get(REMOVE_TAGS_FIELD));
    }

    /**
     * Adds an attribute name to the list kept on accepted tags.
     *
     * @param attrName attribute name to keep
     */
    public void addAtributes(String attrName){
        this.attributes.add(attrName);
    }

    /**
     * Removes an attribute name from the list kept on accepted tags.
     *
     * @param attrName attribute name to stop keeping
     */
    public void removeAtributes(String attrName){
        this.attributes.remove(attrName);
    }

    /**
     * Adds a tag to the removal list.
     *
     * @param tagName tag to remove
     */
    public void addRmoveTag(String tagName){
        this.removeTags.add(tagName);
    }

    /**
     * Takes a tag off the removal list.
     *
     * @param tagName tag to stop removing
     */
    public void removeRmoveTag(String tagName){
        this.removeTags.remove(tagName);
    }

    /**
     * Adds a tag to the accepted list.
     *
     * @param tagName tag to keep
     */
    public void addAcceptTag(String tagName){
        this.acceptTags.add(tagName);
    }

    /**
     * Takes a tag off the accepted list.
     *
     * @param tagName tag to stop keeping
     */
    public void removeAcceptTag(String tagName){
        this.acceptTags.remove(tagName);
    }

    /**
     * Splits a comma-separated value and appends its entries to the list.
     * Entries are trimmed and blank ones are skipped.
     *
     * @param list    destination list
     * @param sources comma-separated values; {@code null} (key missing or
     *                configuration not loaded) adds nothing
     */
    private void addToList(List<String> list,String sources){
        if(sources == null){
            return;
        }
        for(String raw : sources.split(",")){
            String item = raw.trim();
            if(item.length() > 0){
                list.add(item);
            }
        }
    }

    /**
     * Runs the HTML through the configured element remover and returns the
     * serialized result.
     *
     * @param htmlCode HTML to filter; may be {@code null}
     * @return the filtered HTML, {@code null} when the input is {@code null}, or
     *         the unmodified input when parsing fails
     */
    public String doFilter(String htmlCode){
        if(htmlCode == null){
            return null;
        }

        ElementRemover remover = new ElementRemover();
        // The attribute list is the same for every accepted tag, so build the array once.
        String[] attrs = attributes.toArray(new String[attributes.size()]);
        for(String tag : acceptTags){
            remover.acceptElement(tag, attrs);
        }
        for(String tag : removeTags){
            remover.removeElement(tag);
        }

        CharArrayReader reader = new CharArrayReader(htmlCode.toCharArray());
        try {
            StringWriter filteredDescription = new StringWriter();
            Writer writer = new Writer(filteredDescription,"UTF-8");
            // Filter order matters: the remover runs first, then the writer serializes.
            XMLDocumentFilter[] filters = {remover,writer};
            DOMParser parser = new DOMParser();
            parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
            parser.parse(new InputSource(reader));
            return filteredDescription.toString();
        } catch (Exception e) {
            // Best effort: on any parser failure hand back the original markup.
            e.printStackTrace();
            return htmlCode;
        } finally {
            reader.close();
        }
    }
}
调用doFilter可以过滤HTML的内容
原文地址:http://blog.csdn.net/jaune161/article/details/43561577