码迷,mamicode.com
首页 > 编程语言 > 详细

java 去掉html标签(Java中去掉网页HTML标记的方法 )--正则表达式

时间:2017-07-14 21:19:41      阅读:277      评论:0      收藏:0      [点我收藏+]

标签:case   ase   它的   jpg   ack   还原   内容   type   pattern   

参考:

http://www.cnblogs.com/newsouls/p/3995394.html

http://blog.csdn.net/he20101020/article/details/21228311

 

内容:

package utils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based helpers for removing HTML markup from a string.
 *
 * <p>Escape reference for the patterns used below:
 * <ul>
 *   <li>{@code \n} line feed (U+000A)</li>
 *   <li>{@code \t} horizontal tab (U+0009)</li>
 *   <li>{@code \r} carriage return (U+000D)</li>
 *   <li>{@code \s} any whitespace character, i.e. {@code [ \t\n\x0B\f\r]}</li>
 * </ul>
 *
 * <p>These helpers are a best-effort text extractor, not an HTML parser or a
 * sanitizer: character entities other than {@code &nbsp;} are left untouched
 * and the output must not be treated as safe against script injection.
 *
 * Created by Administrator on 2017/7/14.
 */
public class HtmlUtil {

    /** A {@code <script>} element together with its content (closing tag may contain whitespace). */
    private static final Pattern SCRIPT_BLOCK =
            Pattern.compile("<script[^>]*>[\\s\\S]*?</script\\s*>", Pattern.CASE_INSENSITIVE);

    /** A {@code <style>} element together with its content (closing tag may contain whitespace). */
    private static final Pattern STYLE_BLOCK =
            Pattern.compile("<style[^>]*>[\\s\\S]*?</style\\s*>", Pattern.CASE_INSENSITIVE);

    /** An HTML comment, which may itself contain {@code >} or whole tags. */
    private static final Pattern COMMENT = Pattern.compile("<!--[\\s\\S]*?-->");

    /** Any non-empty tag; {@code [^>]} also matches line terminators, so multi-line tags are covered. */
    private static final Pattern TAG = Pattern.compile("<[^>]+>");

    /** Anything between {@code <} and the next {@code >}, including the empty pair {@code <>}. */
    private static final Pattern ANGLE_PAIR = Pattern.compile("<[^>]*>");

    /** A run of whitespace characters. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** An opening {@code <p>} tag with or without attributes; does not match {@code <pre>} etc. */
    private static final Pattern PARAGRAPH_OPEN =
            Pattern.compile("<p(?:\\s[^>]*)?/?>", Pattern.CASE_INSENSITIVE);

    /** A {@code <br>}, {@code <br/>} or {@code <br />} tag in any letter case. */
    private static final Pattern LINE_BREAK =
            Pattern.compile("<br\\s*/?>", Pattern.CASE_INSENSITIVE);

    /**
     * Demo entry point: strips a sample Weibo search-result fragment and prints the remaining text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        String str = "<div class=\"WB_cardwrap S_bg2\">\n" +
                "  <div class=\"search_feed\">\n" +
                "    <div class=\"person_list_feed clearfix\">\n" +
                "      <div class=\"pl_personlist\">\n" +
                "        <div class=\"list_person clearfix\">\n" +
                "          <div class=\"person_pic\">\n" +
                "            <a target=\"_blank\" href=\"http://weibo.com/114dotcom?refer_flag=1001030201_\" title=\"114导航\" suda-data=\"key=tblog_search_user&value=user_feed_1_icon\">\n" +
                "              <img class=\"W_face_radius\" src=\"http://tva4.sinaimg.cn/crop.176.129.505.505.180/006aCV5fgw1f0gxs4ikuyj30ne0juq47.jpg\" uid=\"5653836249\" height=\"80\" width=\"80\" /></a>\n" +
                "          </div>\n" +
                "          <div class=\"person_detail\">\n" +
                "            <p class=\"person_name\">\n" +
                "              <a class=\"W_texta W_fb\" target=\"_blank\" href=\"http://weibo.com/114dotcom?refer_flag=1001030201_\" title=\"114导航\" uid=\"5653836249\" suda-data=\"key=tblog_search_user&value=user_feed_1_name\">114导航</a>\n" +
                "              <a target=\"_blank\" href=\"http://verified.weibo.com/verify\" title=\"微博机构认证\" alt=\"微博机构认证\" class=\"W_icon icon_approve_co\"></a>\n" +
                "            </p>\n" +
                "            <p class=\"person_addr\">\n" +
                "              <span class=\"male m_icon\" title=\"男\"></span>\n" +
                "              <span>广东</span>\n" +
                "              <a class=\"W_linkb\" target=\"_blank\" href=\"http://weibo.com/114dotcom?refer_flag=1001030201_\" class=\"wb_url\" suda-data=\"key=tblog_search_user&value=user_feed_1_url\">http://weibo.com/114dotcom</a></p>\n" +
                "            <p class=\"person_card\">\n" +
                "              <em class=\"red\">一一四网络有限公司</em></p>\n" +
                "            <p class=\"person_num\">\n" +
                "              <span>关注\n" +
                "                <a class=\"W_linkb\" href=\"http://weibo.com/5653836249/follow?refer_flag=1001030201_\" target=\"_blank\" suda-data=\"key=tblog_search_user&value=user_feed_1_num\">68</a></span>\n" +
                "              <span>粉丝\n" +
                "                <a class=\"W_linkb\" href=\"http://weibo.com/5653836249/fans?refer_flag=1001030201_\" target=\"_blank\" suda-data=\"key=tblog_search_user&value=user_feed_1_num\">118</a></span>\n" +
                "              <span>微博\n" +
                "                <a class=\"W_linkb\" href=\"http://weibo.com/5653836249/profile?refer_flag=1001030201_\" target=\"_blank\" suda-data=\"key=tblog_search_user&value=user_feed_1_num\">7</a></span>\n" +
                "            </p>\n" +
                "            <div class=\"person_info\">\n" +
                "              <p>简介: 114.com,不一样的导航,能记住吗?追求小清新,简约到极致。推荐给您的不仅是网址,更是给你需要的答案。此外,114.com还提供电话、品牌、名人、价格等各种实用查询。</p>\n" +
                "            </div>\n" +
                "            <p class=\"person_label\">标签:\n" +
                "              <a class=\"W_linkb\" href=\"&tag=%25E6%2596%25B0%25E9%2597%25BB%25E7%2583%25AD%25E7%2582%25B9&Refer=SUer_tag\" suda-data=\"key=tblog_search_user&value=user_feed_1_label\">新闻热点</a></p>\n" +
                "          </div>\n" +
                "        </div>\n" +
                "      </div>\n" +
                "    </div>\n" +
                "  </div>\n" +
                "</div>\n" +
                "<div class=\"WB_cardwrap S_bg2 relative\"></div>\n" +
                "<!-- 未登录提示 -->\n" +
                "<div class=\"search_tips clearfix\">\n" +
                "  <p class=\"tips_co\">\n" +
                "    <span class=\"tips_icon icon_warn\"></span>\n" +
                "    <span class=\"tips_txt\">\n" +
                "      <a href=\"javascript:void(0);\" action-type=\"login\">立即登录</a>查看更多结果。还没有账号?赶紧\n" +
                "      <a href=\"http://weibo.com/signup/signup.php?lang=zh-cn&amp;entry=weisousuo\" suda-data=\"key=tblog_search_v4.1&amp;value=nologin_reg\" target=\"_blank\">注册微博</a></span>\n" +
                "  </p>\n" +
                "</div>\n" +
                "<!-- /未登录提示 -->";

        System.out.println(delHTMLTag(str));
    }

    /**
     * Removes script blocks, style blocks, comments, all remaining tags and
     * <em>all</em> whitespace from the given HTML.
     *
     * <p>Because every whitespace character is dropped, words separated only by
     * spaces are joined together; this suits compact CJK text extraction but not
     * space-delimited languages.
     *
     * @param htmlStr the HTML to strip; {@code null} is treated as empty input
     * @return the remaining text with no tags and no whitespace, never {@code null}
     */
    public static String delHTMLTag(String htmlStr){
        if (htmlStr == null) {
            return "";
        }
        // Script and style go first so that their bodies (which may contain
        // '<', '>' or "<!--") are removed as a unit rather than tag by tag.
        String text = SCRIPT_BLOCK.matcher(htmlStr).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");
        // Comments must be removed before the generic tag pass: otherwise a
        // comment containing '>' or markup would leak its content and "-->".
        text = COMMENT.matcher(text).replaceAll("");
        text = TAG.matcher(text).replaceAll("");
        text = WHITESPACE.matcher(text).replaceAll("");
        // trim() still strips leading/trailing control characters that \s does not cover.
        return text.trim();
    }

    /**
     * Removes markup while keeping a rough line structure: every opening
     * {@code <p>} tag and every {@code <br>} tag becomes {@code "\r\n"}, and
     * everything else between {@code <} and {@code >} is dropped.
     *
     * <p>Unlike {@link #delHTMLTag(String)}, the content of script and style
     * elements and all existing whitespace are kept. Character entities are
     * not decoded.
     *
     * @param content the HTML to strip; {@code null} is treated as empty input
     * @return the text with paragraph and line-break tags turned into CRLF, never {@code null}
     */
    public static String stripHtml(String content) {
        if (content == null) {
            return "";
        }
        // Drop comments first so markup inside them cannot produce stray line breaks.
        String text = COMMENT.matcher(content).replaceAll("");
        // <p> and <p ...> start a new line.
        text = PARAGRAPH_OPEN.matcher(text).replaceAll("\r\n");
        // <br>, <br/> and <br /> start a new line.
        text = LINE_BREAK.matcher(text).replaceAll("\r\n");
        // Remove whatever else sits between '<' and '>' (also across line breaks).
        return ANGLE_PAIR.matcher(text).replaceAll("");
    }

    /**
     * Extracts plain text from HTML: applies {@link #delHTMLTag(String)} and
     * then removes every literal {@code &nbsp;} entity.
     *
     * @param htmlStr the HTML to convert; {@code null} is treated as empty input
     * @return the extracted text, never {@code null}
     */
    public static String getTextFromHtml(String htmlStr){
        return delHTMLTag(htmlStr).replace("&nbsp;", "");
    }

}

 

java 去掉html标签(Java中去掉网页HTML标记的方法 )--正则表达式

标签:case   ase   它的   jpg   ack   还原   内容   type   pattern   

原文地址:http://www.cnblogs.com/tangyongathuse/p/7172121.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!