标签:快速 标题 attr att 间隔 sts blank jsoup tor
webmagic简介:
WebMagic是一个简单灵活的Java爬虫框架。你可以快速开发出一个高效、易维护的爬虫。
准备工作:
Maven依赖(我这里用的Maven创建的web项目做测试):
<dependencies>
    <!-- JUnit (tests only) -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>

    <!-- Logging: SLF4J API -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.12</version>
    </dependency>
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-core</artifactId>
        <version>1.2.3</version>
    </dependency>
    <!-- Logback binding that implements the SLF4J API -->
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-classic</artifactId>
        <version>1.2.3</version>
    </dependency>

    <!-- Database: MySQL JDBC driver.
         Default (compile) scope on purpose: the sample code imports
         com.mysql.jdbc.Connection directly, which a runtime-scoped
         dependency would hide from the compiler. -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.34</version>
    </dependency>
    <!-- c3p0 connection pool -->
    <dependency>
        <groupId>c3p0</groupId>
        <artifactId>c3p0</artifactId>
        <version>0.9.1.2</version>
    </dependency>

    <!-- DAO framework: MyBatis -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis</artifactId>
        <version>3.4.0</version>
    </dependency>
    <!-- MyBatis / Spring integration -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis-spring</artifactId>
        <version>1.3.0</version>
    </dependency>

    <!-- Servlet / web dependencies -->
    <dependency>
        <groupId>taglibs</groupId>
        <artifactId>standard</artifactId>
        <version>1.1.2</version>
    </dependency>
    <dependency>
        <groupId>jstl</groupId>
        <artifactId>jstl</artifactId>
        <version>1.2</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.5.1</version>
    </dependency>
    <dependency>
        <groupId>javax.servlet</groupId>
        <artifactId>javax.servlet-api</artifactId>
        <version>3.1.0</version>
    </dependency>

    <!-- Spring -->
    <!-- 1. Spring core -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-core</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-beans</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-context</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- 2. Spring DAO layer -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-jdbc</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-tx</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- Spring web -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-web</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-webmvc</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- Spring test support; version kept identical to the other Spring
         modules so that mismatched Spring jars do not end up on one classpath -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-test</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>

    <!-- WebMagic crawler framework -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
    </dependency>
</dependencies>
数据库表SQL:
-- Table that stores one row per crawled blog post.
-- The COMMENT literals use plain ASCII single quotes; typographic quotes are not valid SQL string delimiters.
CREATE TABLE `Boke` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
  `title` varchar(255) DEFAULT NULL COMMENT '标题',
  `linke` varchar(255) DEFAULT NULL COMMENT '正文地址',
  `author` varchar(255) DEFAULT NULL COMMENT '作者',
  `authorUrl` varchar(255) DEFAULT NULL COMMENT '作者主页',
  `summary` varchar(1000) DEFAULT NULL COMMENT '简介',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4890 DEFAULT CHARSET=utf8;
数据库连接工具类:
import java.sql.DriverManager;
import java.sql.SQLException;
import com.mysql.jdbc.Connection;
/**
 * Small JDBC helper that opens connections to the MySQL database used by the crawler.
 */
public class MySqlJdbcUtils {

    // Fully qualified name of the JDBC driver class loaded before connecting.
    private static String driver = "com.mysql.jdbc.Driver";

    // JDBC URL of the target database.
    private static String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";

    // Database account name.
    private static String name = "tradingbp";

    // Database account password.
    private static String pwd = "123456";

    /**
     * Loads the driver class and opens a new connection with the configured URL and credentials.
     *
     * @return the opened connection, or {@code null} when the driver class cannot be found
     *         or the connection attempt raises an {@link SQLException} (the stack trace is
     *         printed in both cases)
     */
    public static Connection getOpenConnection() {
        // Step 1: make sure the driver class is loaded.
        try {
            Class.forName(driver);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            return null;
        }

        // Step 2: open the connection and report success on stdout.
        try {
            Connection opened = (Connection) DriverManager.getConnection(url, name, pwd);
            System.out.println("获得数据库链接");
            return opened;
        } catch (SQLException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Manual smoke test: tries to open one connection.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        getOpenConnection();
    }
}
import java.sql.DriverManager; import java.sql.SQLException; import com.mysql.jdbc.Connection; public class MySqlJdbcUtils { private static String driver = "com.mysql.jdbc.Driver"; private static String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8"; private static String name="tradingbp"; private static String pwd="123456"; /** * * 获取链接 * * @date 2017年8月31日 * @return */ public static Connection getOpenConnection(){ Connection conn= null; try { //加载驱动 Class.forName(driver); conn=(Connection) DriverManager.getConnection(url, name, pwd); System.out.println("获得数据库链接"); } catch (ClassNotFoundException e) { e.printStackTrace(); }catch (SQLException e) { e.printStackTrace(); } return conn; } public static void main(String[] args) { getOpenConnection(); } }
实体类:
/**
 * Plain data holder for one blog post entry: title, post link, author,
 * author home page and summary. All properties start out as {@code null}.
 */
public class JavaBokeModel {

    /** Post title. */
    private String title;

    /** URL of the post body. */
    private String linke;

    /** Author display name. */
    private String author;

    /** URL of the author's home page. */
    private String authorUrl;

    /** Short summary of the post. */
    private String summary;

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the URL of the post body */
    public String getLinke() {
        return this.linke;
    }

    /** @param linke the URL of the post body */
    public void setLinke(String linke) {
        this.linke = linke;
    }

    /** @return the author display name */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author display name */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the URL of the author's home page */
    public String getAuthorUrl() {
        return this.authorUrl;
    }

    /** @param authorUrl the URL of the author's home page */
    public void setAuthorUrl(String authorUrl) {
        this.authorUrl = authorUrl;
    }

    /** @return the post summary */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the post summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }
}
webmagic 框架爬取数据并保存
import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.ArrayList; import java.util.Date; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import com.mysql.jdbc.Connection; import com.nio.webmagic.jdbc.MySqlJdbcUtils; import com.nio.webmagic.model.JavaBokeModel; /** * * 爬虫 * * @version [VCES V201R001, 2017年10月12日] * * @see 方法实现 PageProcessor * @since [产品/模块版本] */ public class JavaBoKePageProcessor implements PageProcessor { private static Connection conn=null; private static PreparedStatement ps =null; //标题和链接获取 private static String TITLEQUERY="div.post_item_body h3 a.titlelnk"; //作者 private static String AUTHORQUERY="div.post_item_foot a.lightblue "; //简介 private static String SUMMARYQUERY="div.post_item_body p.post_item_summary"; //插入sql语句 private static String insertSql ="INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)"; //初始链接 private static Connection getConnection(){ if (conn==null) { conn = MySqlJdbcUtils.getOpenConnection(); } return conn; } /** * * insert操作 * * @date 2017年8月31日 * @return */ private synchronized void insertDb(List<JavaBokeModel> javaBokes){ try { ps = conn.prepareStatement(insertSql); for (JavaBokeModel javaBoke:javaBokes) { ps.setString(1, javaBoke.getTitle().toString()); ps.setString(2, javaBoke.getLinke().toString()); ps.setString(3, javaBoke.getAuthor().toString()); ps.setString(4, javaBoke.getAuthorUrl().toString()); ps.setString(5, javaBoke.getSummary().toString()); ps.executeUpdate(); } } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } //初始化带爬取网页地址 private static List<String> urls(){ List<String> listUrl =new ArrayList<String>(); for (int i = 2; i <=200; i++) { //listUrl.add("http://www.cnblogs.com/cate/java/"+i); 
listUrl.add("http://www.cnblogs.com/cate/java/"+i); } listUrl.toArray(new String[listUrl.size()]); return listUrl; } /** * * jsoup根据 html 字符串和语法获取内容; * @date 2017年8月31日 * @param htmlText * @return */ private static String seletDocumentText(String htmlText,String Query){ Document doc = Jsoup.parse(htmlText); String select = doc.select(Query).text(); return select; } /** * * jsoup根据 html 字符串和语法获取链接地址; * @date 2017年8月31日 * @param htmlText * @return */ private static String seletDocumentLink(String htmlText,String Query){ Document doc = Jsoup.parse(htmlText); String select = doc.select(Query).attr("href"); return select; } /** * process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 * @see us.codecraft.webmagic.processor.PageProcessor#process(us.codecraft.webmagic.Page) */ @Override public void process(Page page) { // page.addTargetRequests(urls()); //div[@class=‘post_item‘]//div[@class=‘post_item_body‘]//h3//a[@class=‘titlelnk‘]/text()‘ // 定义如何抽取页面信息,并保存下来 List<String> htmls =page.getHtml().xpath("//div[@class=‘post_item‘]/html()").all(); List<JavaBokeModel> javaBokes=new ArrayList<JavaBokeModel>(); for (String html:htmls) { JavaBokeModel javaBoke =new JavaBokeModel(); //标题和链接 String title =seletDocumentText(html,TITLEQUERY); String linke =seletDocumentLink(html,TITLEQUERY); //作者和作者主页 String author=seletDocumentText(html, AUTHORQUERY); String authorUrl=seletDocumentLink(html, AUTHORQUERY); //简介 String summary=seletDocumentText(html, SUMMARYQUERY); javaBoke.setTitle(title); javaBoke.setAuthor(author); javaBoke.setAuthorUrl(authorUrl); javaBoke.setLinke(linke); javaBoke.setSummary(summary); javaBokes.add(javaBoke); } insertDb(javaBokes); } @Override public Site getSite() { //抓去网站的相关配置包括:编码、重试次数、抓取间隔 return Site.me().setSleepTime(1000).setRetryTimes(10); } public static void main(String[] args) { long startTime ,endTime; System.out.println("========小爬虫【启动】喽!========="); getConnection(); startTime = new Date().getTime(); //入口 Spider create = Spider.create(new JavaBoKePageProcessor()); //定义入口地址 
create.addUrl("http://www.cnblogs.com/cate/java/").thread(5).run(); try { ps.close(); conn.close(); } catch (Exception e) { // TODO: handle exception } endTime = new Date().getTime(); System.out.println("========小爬虫【结束】喽!========="); System.out.println("用时为:"+(endTime-startTime)/1000+"s"); } }
数据:
标签:快速 标题 attr att 间隔 sts blank jsoup tor
原文地址:https://www.cnblogs.com/aibabel/p/11017558.html