结构化网页内容抽取方法

时间：2020-01-03 19:46:50 阅读：179 评论：0 收藏：0 [点我收藏+]

标签：代码 ignore oar controls tostring group length ror value

为了从几个网站抽取内容并聚合到一起，我于2012年写了一个程序，通过结构化方法从多个网站抽取内容，然后写入数据库，形成一个网站。

（1）正则表达式抽取

首先，从数据库中读取内容抽取规则：

// Fetch all extraction rules through the rule DAO (the article states they are stored in the database).
ArrayList<RuleBean> rbList = ruleDao.QueryAllRule();

表结构如下：

配置的抽取规则如下：

技术图片

其次，读取网页内容，通过起止标签（areaBegin / areaEnd）截取出内容区域，然后用正则表达式从中读取网址URL、标题和发表时间。

直接上代码如下：

/**
 * Crawls one configured source and stores every extracted row.
 *
 * Steps visible in this block: fetch the page named by the rule, narrow it
 * with getContentArea using the rule's begin/end markers, iterate over every
 * match of the rule's row regex, pull title / URL / date out of each row with
 * the rule's per-field regexes, persist the bean through TitleAndUrlDAO.Add,
 * and, for app names containing "jww", update the "index_teach" cache entry.
 *
 * NOTE(review): as published, this block is incomplete. The statement that
 * starts with "String u = myurl.group().replaceAll(" has lost its arguments,
 * and the closing braces of the URL loop, the "if (r)" branch, the row loop
 * and the method itself do not all appear. Recover the original source
 * before compiling.
 *
 * @param rb extraction rule supplying the URL, encoding, area markers and regexes
 */
private static void doCrawl(RuleBean rb) {

// Fetch the page; getUrlContent appears to signal failure with the string "error" - confirm.
String urlContent = getUrlContent(rb.getCrawlUrl(),rb.getEncode());

if("error".equalsIgnoreCase(urlContent)){

return;

}

// Restrict matching to the region selected by the rule's begin/end markers.
String contentArea = getContentArea(urlContent, rb.getAreaBegin(),

rb.getAreaEnd());

// Each match of the rule's row regex yields one TitleAndUrlBean.
Pattern pt = Pattern.compile(rb.getRegex());

Matcher mt = pt.matcher(contentArea);

TitleAndUrlBean tuBean;

while (mt.find()) {

tuBean = new TitleAndUrlBean();

tuBean.setAppName(rb.getAppName());

tuBean.setInfoArea(rb.getInfoArea());

String rowContent = mt.group();

// Drop everything matching the rule's remove-regex before field extraction.
rowContent = rowContent.replaceAll(rb.getRemoveRegex(), "");

// Extract the title

Matcher title = Pattern.compile(rb.getTitleRegex()).matcher(

rowContent);

// If the title regex matches more than once, the last match wins.
while (title.find()) {

// Strip leftover markup fragments and any [bracketed] text from the match.
String s = title.group().replaceAll("<u>|</u>|>|</a>|\\[.*?\\]|</l>","");

// A blank title is stored as the literal "error".
if(s ==null || s.trim().length()<=0){

s = "error";

}

tuBean.setTitle(s);

}

// Extract the URL

Matcher myurl = Pattern.compile(rb.getUrlRegex()).matcher(

rowContent);

while (myurl.find()) {

// NOTE(review): the argument list of this replaceAll call is missing from the
// published source; the statement is truncated and will not compile as shown.
String u = myurl.group().replaceAll(

// NOTE(review): "\‘" looks like a mis-encoded "\'" (ASCII single quote); if so,
// this removes single quotes and backslashes from the URL - confirm.
u = u.replaceAll("\‘|\\\\", "");

// URLs that do not contain "http://" get the rule's prefix prepended.
if(u!=null && (u.indexOf("http://")==-1)){

tuBean.setUrl(rb.getPrefix() + u);

}else{

tuBean.setUrl(u);

}

// A null URL is stored as the literal "error".
if(tuBean.getUrl() ==null){

tuBean.setUrl("error");

}

// Extract the publication date

Matcher d = Pattern.compile(rb.getDateRegex()).matcher(rowContent);

// If the date regex matches more than once, the last match wins.
while (d.find()) {

tuBean.setDeliveryDate(d.group());

}

// Persist the row; the cache is only touched when Add returns true.
boolean r = TitleAndUrlDAO.Add(tuBean);

if (r){

log.info("crawl add " + tuBean.getAppName() + "_"

+ tuBean.getInfoArea()+"_"+tuBean.getTitle());

if(tuBean.getAppName().contains("jww")){

Cache cTeach = CacheManager.getCacheInfo("index_teach");

if(cTeach!=null){

teachList = (List<TitleAndUrlBean>) cTeach.getValue();

}

// teachList is declared outside this block; it is capped at 5 entries,
// dropping the entry at index 0 when the cap is exceeded.
teachList.add(tuBean);

if(teachList.size()>5){

teachList.remove(0);

}

// NOTE(review): if getCacheInfo returned null above, cTeach is still null here
// and the next call throws NullPointerException.
cTeach.setValue(teachList);

cTeach.setTimeOut(-1);

CacheManager.putCache("index_teach", cTeach);

}

System.out.println("end crawl "+rb.getCrawlUrl());

}

（2）DWR返回内容的抽取

在当时dwr是比较流行的技术，为了抽取dwr的内容，着实花了一番功夫。

首先通过httpClient获取内容

public static void startCrawl() throws Exception{

DefaultHttpClient httpclient = new DefaultHttpClient();

HttpResponse response = null;

HttpEntity entity = null;

httpclient.getParams().setParameter(ClientPNames.COOKIE_POLICY,

CookiePolicy.BROWSER_COMPATIBILITY);

HttpPost httpost = new HttpPost(

"http://XXXXXXXXXX/Tzgg.getMhggllList.dwr");

List<NameValuePair> nvps = new ArrayList<NameValuePair>();

nvps.add(new BasicNameValuePair("callCount", "1"));

nvps.add(new BasicNameValuePair("page", "/oa/tzggbmh.do"));

nvps.add(new BasicNameValuePair("c0-scriptName", "Tzgg"));

nvps.add(new BasicNameValuePair("c0-methodName", "getMhggllList"));

nvps.add(new BasicNameValuePair("c0-id", "0"));

nvps.add(new BasicNameValuePair("c0-e1", "string:0"));

nvps.add(new BasicNameValuePair("c0-e2", "string:0"));

nvps.add(new BasicNameValuePair("c0-e4", "string:%20%20"));

nvps.add(new BasicNameValuePair("c0-e5", "string:rsTable"));

nvps.add(new BasicNameValuePair(

"c0-param0",

"Array:[reference:c0-e1,reference:c0-e2,reference:c0-e3,reference:c0-e4,reference:c0-e5]"));

nvps.add(new BasicNameValuePair("c0-e6", "number:20"));

nvps.add(new BasicNameValuePair("c0-e7", "number:1"));

nvps.add(new BasicNameValuePair("c0-param1",

"Object_Object:{pageSize:reference:c0-e6, currentPage:reference:c0-e7}"));

nvps.add(new BasicNameValuePair("batchId", "0"));

int infoArea = 1;

while(infoArea <4){

nvps.add(new BasicNameValuePair("c0-e3", "string:0"+infoArea));

httpost.setEntity(new UrlEncodedFormEntity(nvps));

response = httpclient.execute(httpost);

entity = response.getEntity();

try {

String responseString = null;

if (response.getEntity() != null) {

responseString = EntityUtils.toString(response.getEntity());

if(1 == infoArea){

extractData(responseString,"事务通知");

infoArea = 3;

}else if(infoArea == 3){

extractData(responseString,"公告公示");

infoArea = 100;

}

} finally {

}

httpclient.getConnectionManager().shutdown();

}

然后通过正则表达式抽取

private static void extractData(String content,String infoArea) throws Exception{

TitleAndUrlDAO tuDao = new TitleAndUrlDAO();

TitleAndUrlBean tuBean;

Pattern pt = Pattern.compile("llcs.*?a>");

Matcher mt = pt.matcher(content);

Cache c = new Cache();

while (mt.find()) {

tuBean = new TitleAndUrlBean();

tuBean.setAppName("info_xb");

tuBean.setInfoArea(infoArea);

String s2 = mt.group();

// 获取标题

Matcher title = Pattern.compile("title.*?>").matcher(s2);

while (title.find()) {

String s = title.group().replaceAll("title=|>", "");

tuBean.setTitle(unicodeToString(s));

}

// 获取网址

// Matcher myurl = Pattern.compile("href=.*?>").matcher(mt.group());

Matcher myurl = Pattern.compile("ID=.*?;").matcher(s2);

while (myurl.find()) {

String prefix = "http://XXXXXXXXX/tzggbmh.do?theAction=view&parameter.id=";

tuBean.setUrl(prefix + myurl.group().replaceAll("ID=|;|\"", ""));

}

// 获取时间

Matcher d = Pattern.compile("[0-9]{4}-[0-9]{2}-[0-9]{1,2}")

.matcher(s2);

while (d.find()) {

tuBean.setDeliveryDate(d.group());

}

boolean r = tuDao.Add(tuBean);

if (r){

log.info("crawl add " + tuBean.getAppName() + "_"

+ tuBean.getInfoArea()+"_"+tuBean.getTitle());

Cache cNotice = CacheManager.getCacheInfo("index_notice");

if(cNotice!=null){

xb_noticeList = (List<TitleAndUrlBean>) cNotice.getValue();

}

xb_noticeList.add(tuBean);

if(xb_noticeList.size()>5){

xb_noticeList.remove(0);

}

c.setValue(xb_noticeList);

c.setTimeOut(-1);

CacheManager.putCache("index_notice", c);

}

本文使用的抽取方法代码，写于2012年，每次网站结构变化的时候需要重新配置规则。

不知道这么多年过去，是否已经有更智能的方法来获取网站上这种半结构化数据？

如果有，请留言告知，谢谢！

结构化网页内容抽取方法

标签：代码 ignore oar controls tostring group length ror value

原文地址：https://www.cnblogs.com/siweihz/p/12146167.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行