package com.smilezl.scrapy;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal link scraper: downloads one HTML page, extracts at most one link per
 * line of markup, and stores the distinct links in a PostgreSQL table.
 */
public class ScrapyUrl {

    /** Charset used when the response does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Upper bound, in milliseconds, for establishing the HTTP connection. */
    private static final int CONNECT_TIMEOUT_MS = 15000;

    /** Upper bound, in milliseconds, for a single blocking read of the response. */
    private static final int READ_TIMEOUT_MS = 30000;

    /**
     * Extracts the charset token of a Content-Type value. The token ends at
     * whitespace, a semicolon or a quote, so trailing parameters are excluded.
     */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Matches an absolute http/https URL. */
    private static final Pattern ABSOLUTE_URL_PATTERN =
            Pattern.compile("(http://|https://){1}[\\w\\.\\-/:]+");

    /**
     * Matches a double-quoted, root-relative href whose value contains "html";
     * group 1 is the path, from the leading slash through the last "html"
     * inside that attribute value.
     */
    private static final Pattern ROOT_RELATIVE_HREF_PATTERN =
            Pattern.compile("href=\"(/[^\"]*html)");

    /** Parameterized insert; the link is bound as data, never spliced into the SQL text. */
    private static final String INSERT_SQL = "insert into scrapyurl(url,type) values(?,0)";

    /**
     * Downloads a page and collects the links found in it.
     *
     * <p>The page is read line by line and {@link #getHref(String, String)} is
     * applied to each line, so at most one link per line is collected.
     * Duplicates are dropped and first-seen order is kept.
     *
     * @param htmlUrl address of the page to download
     * @return the distinct links in encounter order; if an I/O error occurs the
     *         stack trace is printed and the links gathered so far are returned
     */
    public static List<String> parserHtml(String htmlUrl) {
        // LinkedHashSet gives constant-time de-duplication while keeping encounter order.
        Set<String> links = new LinkedHashSet<String>();
        HttpURLConnection connection = null;
        try {
            URL url = new URL(htmlUrl);
            connection = (HttpURLConnection) url.openConnection();
            // Without timeouts an unresponsive server would block this call forever.
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);

            // getContentType() may return null; getCharset tolerates that.
            String charSet = getCharset(connection.getContentType());
            if (charSet == null) {
                charSet = DEFAULT_CHARSET;
            }

            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), charSet))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String link = getHref(line, htmlUrl);
                    if (link != null) {
                        links.add(link);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return new ArrayList<String>(links);
    }

    /**
     * Extracts the charset name from a Content-Type value such as
     * {@code "text/html; charset=UTF-8"}.
     *
     * @param str the Content-Type value; may be {@code null}
     * @return the charset name without surrounding quotes or trailing
     *         parameters, or {@code null} if {@code str} is {@code null} or
     *         declares no charset
     */
    public static String getCharset(String str) {
        if (str == null) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(str);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Extracts one link from a single line of markup.
     *
     * <p>The first absolute http/https URL on the line wins. Otherwise a
     * double-quoted, root-relative href containing "html" is resolved against
     * {@code htmlUrl}, so {@code href="/a/b.html"} on a page at
     * {@code http://host/x/y.shtml} yields {@code http://host/a/b.html}.
     *
     * @param str     one line of the page; may be {@code null}
     * @param htmlUrl address of the page the line came from, used as the base
     *                for resolving root-relative links
     * @return the link, or {@code null} if the line contains none or a
     *         relative link cannot be resolved because {@code htmlUrl} is not
     *         a valid URL
     */
    public static String getHref(String str, String htmlUrl) {
        if (str == null) {
            return null;
        }
        Matcher absolute = ABSOLUTE_URL_PATTERN.matcher(str);
        if (absolute.find()) {
            return absolute.group(0);
        }
        Matcher relative = ROOT_RELATIVE_HREF_PATTERN.matcher(str);
        if (relative.find()) {
            try {
                // Resolve against the page URL so the path replaces the page's
                // own path instead of being appended to it.
                return new URL(new URL(htmlUrl), relative.group(1)).toString();
            } catch (IOException e) {
                // Malformed base URL: no absolute link can be produced.
                return null;
            }
        }
        return null;
    }

    /**
     * Scrapes the given page and inserts every link found into the
     * {@code scrapyurl} table with {@code type} 0, printing each link to
     * standard output before it is stored.
     *
     * <p>Any failure is reported by printing the stack trace; nothing is
     * rethrown.
     *
     * @param hrefurl address of the page to scrape
     */
    public static void saveUrlList(String hrefurl) {
        try {
            List<String> links = parserHtml(hrefurl);
            Class.forName("org.postgresql.Driver");
            // NOTE(review): connection settings and credentials are hard-coded;
            // consider moving them to configuration.
            String url = "jdbc:postgresql://localhost:5432/mydb?useUnicode=true&characterEncoding=gbk";
            try (Connection con = DriverManager.getConnection(url, "postgres", "password");
                 PreparedStatement insert = con.prepareStatement(INSERT_SQL)) {
                for (String link : links) {
                    System.out.println(link);
                    insert.setString(1, link);
                    insert.executeUpdate();
                }
            }
        } catch (Exception e) {
            // Top-level boundary: keep the best-effort contract and just report.
            e.printStackTrace();
        }
    }

    /**
     * Scrapes a fixed sample page and stores its links.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        saveUrlList("http://fo.ifeng.com/fojiaomeiwen/list_0/0.shtml");
    }
}
// Original article: http://smilezhuolin.blog.51cto.com/7671611/1405966