package com.lxf.crawler;
import java.io.File;
import java.io.FileWriter;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import com.lxf.dao.bean.NewsBean;
import com.lxf.dao.imp.NewsDao;
import com.lxf.dao.inf.NewsDaoInf;
/**
 * Crawler that collects headlines and article bodies from Sina News
 * (requires the htmlparser jar on the classpath).
 *
 * @author 刘向峰
 */
public class SinaNews {

    /**
     * Test entry point: makes sure a directory named after the section exists
     * and crawls that section.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        // Section names noted by the author: china, world, society, media, opinion.
        String type = "world";
        File file = new File(type);
        if (!file.exists()) {
            file.mkdirs();
        }
        SinaNews gn = new SinaNews();
        gn.getNews(type);
    }

    /**
     * Crawls the listing page {@code http://news.sina.com.cn/<type>/}, stores
     * every new article through the DAO and builds an HTML table of the
     * headlines that were processed.
     *
     * <p>Headlines that already exist in the store, video headlines and
     * articles without an {@code artibody} element are skipped and do not
     * appear in the table. List items that do not contain a parsable anchor
     * are skipped as well instead of aborting the whole crawl.
     *
     * @param type section name appended to the base URL and stored with each article
     * @return an HTML document listing the processed headlines, or an empty
     *         string when the listing page itself could not be processed
     */
    public String getNews(String type) {
        NewsDaoInf dao = new NewsDao();
        try {
            NodeFilter filter = new TagNameFilter("ul");
            // Selects the article container; identical for every article, so built once.
            NodeFilter bodyfilter = new AndFilter(new TagNameFilter("div"),
                    new HasAttributeFilter("id", "artibody"));
            Parser parser = new Parser();
            Parser bodyparser = new Parser();
            parser.setURL("http://news.sina.com.cn/" + type + "/");
            parser.setEncoding("gb2312");
            NodeList list = parser.extractAllNodesThatMatch(filter);
            StringBuilder newsStr = new StringBuilder(
                    "<!DOCTYPE html><html><head></head><body><table>");
            // NOTE(review): the last <ul> is left out by the loop bound, exactly as
            // in the original code - presumably it is not a news list; confirm.
            for (int i = 0; i < list.size() - 1; i++) {
                Tag node = (Tag) list.elementAt(i);
                NodeList children = node.getChildren();
                if (children == null) {
                    // A <ul> without children has nothing to crawl.
                    continue;
                }
                // NOTE(review): iteration starts at the second child, as in the
                // original code - presumably the first child is layout text; confirm.
                for (int j = 1; j < children.size(); j++) {
                    String textstr = children.elementAt(j).toHtml().trim();
                    if (textstr.length() == 0) {
                        continue;
                    }
                    int anchorOpenEnd = findAnchorOpenEnd(textstr);
                    String link = extractLink(textstr, anchorOpenEnd);
                    String title = extractTitle(textstr, anchorOpenEnd);
                    if (link == null || title == null) {
                        // Not a headline item (no anchor); one odd item must not
                        // stop the rest of the crawl.
                        continue;
                    }
                    System.out.println("正在抓取: " + title);
                    // Strip the picture marker BEFORE the duplicate check: the stored
                    // title never contains it, so checking the raw title would miss
                    // articles that were saved on an earlier run.
                    title = title.replace("(图)", "");
                    if (dao.hasNews(title)) {
                        System.out.println("【该记录已经存在】");
                        continue;
                    }
                    // Video headlines are marked with either an ASCII or a
                    // full-width (U+FF1A) colon.
                    if (title.contains("视频:") || title.contains("视频\uFF1A")) {
                        System.out.println("【无法获得视频新闻】");
                        continue;
                    }
                    try {
                        bodyparser.setURL(link);
                        bodyparser.setEncoding("gb2312");
                        NodeList bodylist = bodyparser
                                .extractAllNodesThatMatch(bodyfilter);
                        if (bodylist.elementAt(0) == null) {
                            System.out.println("【新闻无内容】");
                            continue;
                        }
                        String body = extractBody(bodylist.elementAt(0).toHtml().trim());
                        // The 10 characters before the last '/' of the link are stored
                        // as a separate field - presumably the yyyy-MM-dd path segment;
                        // verify against NewsBean. A link too short for this raises
                        // here and is reported by the catch below.
                        int lastSlash = link.lastIndexOf("/");
                        NewsBean newsBean = new NewsBean(0, title, body, link,
                                link.substring(lastSlash - 10, lastSlash), type);
                        dao.add(newsBean);
                    } catch (Exception e) {
                        // Best effort: a broken article page must not stop the crawl.
                        System.out.println("抓取信息子页面出错,出错信息为:");
                        e.printStackTrace();
                    }
                    newsStr.append("<tr><td><a target=\"_blank\" href=\""
                            + link + "\">");
                    newsStr.append(title);
                    newsStr.append("</a></td></tr>");
                }
            }
            newsStr.append("</table></body></html>");
            return newsStr.toString();
        } catch (Exception e) {
            System.out.println("抓取信息出错,出错信息为:");
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Finds the {@code ">} that ends the opening tag carrying the first
     * {@code href=} attribute, searching from the attribute value onwards so
     * that earlier attributes cannot be mistaken for it.
     *
     * @return the index of that {@code ">}, or -1 when there is none
     */
    private static int findAnchorOpenEnd(String itemHtml) {
        int hrefAt = itemHtml.indexOf("href=");
        if (hrefAt < 0) {
            return -1;
        }
        // +6 skips href= and the opening quote.
        return itemHtml.indexOf("\">", hrefAt + 6);
    }

    /**
     * Returns the href value of the first anchor, or null when the item has
     * no parsable anchor.
     */
    private static String extractLink(String itemHtml, int anchorOpenEnd) {
        if (anchorOpenEnd < 0) {
            return null;
        }
        int valueStart = itemHtml.indexOf("href=") + 6;
        String value = itemHtml.substring(valueStart, anchorOpenEnd);
        // When further attributes (e.g. target) follow the href, the value ends
        // at its own closing quote.
        int quoteAt = value.indexOf('"');
        return quoteAt >= 0 ? value.substring(0, quoteAt) : value;
    }

    /**
     * Returns the trimmed text between the anchor's opening tag and its
     * {@code </a>}, or null when the item has no parsable anchor.
     */
    private static String extractTitle(String itemHtml, int anchorOpenEnd) {
        if (anchorOpenEnd < 0) {
            return null;
        }
        int textStart = anchorOpenEnd + 2;
        int textEnd = itemHtml.indexOf("</a>", textStart);
        if (textEnd < 0) {
            return null;
        }
        return itemHtml.substring(textStart, textEnd).trim();
    }

    /**
     * Reduces the article container to its paragraphs (from the first
     * {@code <p>} to the last {@code </p>}, keeping the tags for layout) and
     * puts the image wrapper, when present, in front. Falls back to the whole
     * container when no complete paragraph range exists.
     */
    private static String extractBody(String articleHtml) {
        int paragraphStart = articleHtml.indexOf("<p>");
        int lastParagraphClose = articleHtml.lastIndexOf("</p>");
        String body;
        if (paragraphStart >= 0 && lastParagraphClose > paragraphStart) {
            body = articleHtml.substring(paragraphStart, lastParagraphClose + 4);
        } else {
            body = articleHtml;
        }
        int imageStart = articleHtml.indexOf("<div class=\"img_wrapper\">");
        int imageEnd = articleHtml.lastIndexOf("<span class=\"img_descr\">");
        // Only prepend the image when both markers exist in the expected order;
        // the wrapper is cut before its caption, so it is closed by hand.
        if (imageStart >= 0 && imageEnd > imageStart) {
            body = articleHtml.substring(imageStart, imageEnd) + "</div>" + body;
        }
        return body;
    }

    /**
     * Writes {@code str} to a file inside the directory {@code type}.
     *
     * <p>When {@code filename} contains {@code ".cn/"} it is treated as an
     * article URL: the part after {@code ".cn/"} is used as the file name with
     * every {@code '/'} replaced by {@code '_'}. Otherwise {@code filename} is
     * used as given. Missing parent directories are created. Failures are
     * reported on standard output and not propagated.
     *
     * @param str      content to write
     * @param filename article URL or plain file name
     * @param type     directory (section name) the file is placed in
     */
    public void writefile(String str, String filename, String type) {
        String name;
        if (filename.contains(".cn/")) {
            name = filename.substring(filename.indexOf(".cn/") + 4).replace("/", "_");
        } else {
            name = filename;
        }
        // File(parent, child) inserts the platform's separator instead of a
        // hard-coded backslash.
        File file = new File(type, name);
        File parent = file.getParentFile();
        // Create the directories that CONTAIN the file, never the file path itself.
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        FileWriter writer = null;
        try {
            writer = new FileWriter(file);
            writer.write(str);
            writer.close();
            writer = null;
            System.out.println("成功生成新闻页面" + file.getPath());
        } catch (Exception e) {
            System.out.println("将信息写入文件" + file.getPath() + "发生错误,错误信息为:");
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception ignored) {
                    // The primary failure has already been reported above.
                }
            }
        }
    }
}
// Original article: http://blog.csdn.net/lxf_44944/article/details/43794607