标签:
数据库创建脚本:
/*
Navicat MySQL Data Transfer

Source Server         : localhost
Source Server Version : 50621
Source Host           : localhost:3306
Source Database       : cmm-db

Target Server Type    : MYSQL
Target Server Version : 50621
File Encoding         : 65001

Date: 2015-05-25 19:05:58
*/

-- Foreign-key checks are switched off for the duration of the import.
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for `region`
-- ----------------------------
-- One row per administrative region; rows reference their parent through
-- parent_code. String literals below use plain ASCII single quotes: the
-- typographic quotes in the published script are not valid MySQL delimiters.
DROP TABLE IF EXISTS `region`;
CREATE TABLE `region` (
  `area_code` varchar(50) NOT NULL COMMENT '地区编码',               -- region code (primary key)
  `area_name` varchar(50) NOT NULL COMMENT '地区名称',               -- region name
  `parent_code` varchar(50) NOT NULL COMMENT '地区父编码',           -- code of the parent region
  `place_order` varchar(50) NOT NULL DEFAULT '0' COMMENT '显示顺序', -- display order
  PRIMARY KEY (`area_code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT;

-- ----------------------------
-- Records of region
-- ----------------------------
maven依赖:
<!-- MySQL JDBC driver -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.18</version>
</dependency>
<!-- jsoup HTML parser -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.2</version>
</dependency>
源代码:
import java.io.FileWriter; import java.io.IOException; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class JsoupRegion { /** 整数 */ private static final String V_INTEGER = "^-?[1-9]\\d*$"; class Region { public String getCode() { return code; } public void setCode(String code) { this.code = code; } public String getName() { return name; } public void setName(String name) { this.name = name; } public int getParentId() { return parenId; } public void setParentId(int parenId) { this.parenId = parenId; } public String getType() { return type; } public void setType(String type) { this.type = type; } private String code; private String name; private String type;// 乡镇类型 private int parenId; } /** * @说明: url2Document * @param @param url * @param @return * @param @throws IOException * @return Document * @throws */ public static Document url2Doc(String url) throws IOException { // 此种方式403 // return Jsoup.connect(url).get(); // return Jsoup.connect(url).timeout(600 * 1000) // .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36").get(); // return Jsoup.connect(url).timeout(600 * 1000).get(); } /** * 验证是不是整数 * * @param value * 要验证的字符串 要验证的字符串 * @return 如果是符合格式的字符串,返回 <b>true </b>,否则为 <b>false </b> */ public static boolean Integer(String value) { return match(V_INTEGER, value); } /** * @param regex * 正则表达式字符串 * @param str * 要匹配的字符串 * @return 如果str 符合 regex的正则表达式格式,返回true, 否则返回 false; */ private static boolean match(String regex, String str) { Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(str); return matcher.matches(); } /** * 追加文件:使用FileWriter * * 
@param fileName * @param content */ public static void appendFile(String fileName, String content) { FileWriter writer = null; try { // 打开一个写文件器,构造函数中的第二个参数true表示以追加形式写文件 writer = new FileWriter(fileName, true); writer.write(content); } catch (IOException e) { e.printStackTrace(); } finally { try { if (writer != null) { writer.close(); } } catch (IOException e) { e.printStackTrace(); } } } /** * @说明: 获取省份 * @param @param url * @param @return * @param @throws IOException * @return List<Region> * @throws */ private static List<Region> getProvince(String url) throws IOException { List<Region> list = new ArrayList<Region>(); Document doc = url2Doc(url); Elements proviceTr = doc.getElementsByAttributeValue("class", "provincetr");// 通过css获取tr for (Element e : proviceTr) { Elements tds = e.select("a[href]"); for (Element element : tds) { Region region = new JsoupRegion().new Region(); // region.setCode("13"); region.setCode(element.attr("href").substring(0, 2)); region.setName(element.text().replaceAll("<br />", "")); region.setType(""); region.setParentId(0); list.add(region); } } return list; } /** * @说明: 获取省份下的市 * @param @param url * @param @return * @param @throws IOException * @return List<Region> * @throws */ private static List<Region> getCity(String url) throws IOException { List<Region> list = new ArrayList<Region>(); Document doc = url2Doc(url); Elements proviceTr = doc.getElementsByAttributeValue("class", "citytr");// 通过css获取tr for (Element e : proviceTr) { Elements tds = e.select("a[href]"); for (Element element : tds) { if (Integer(element.text())) { continue; } Region region = new JsoupRegion().new Region(); String code = element.attr("href").substring(3, 7); region.setCode(code); region.setName(element.text()); region.setParentId(Integer.valueOf(code.substring(0, 2))); region.setType(""); list.add(region); } } return list; } /** * @说明: 县 * @param @param url * @param @return * @param @throws IOException * @return List<Region> * @throws */ private static 
List<Region> getCounty(String url) throws IOException { List<Region> list = new ArrayList<Region>(); Document doc = url2Doc(url); Elements proviceTr = doc.getElementsByAttributeValue("class", "countytr");// 通过css获取tr for (Element e : proviceTr) { Elements tds = e.select("a[href]"); for (Element element : tds) { if (Integer(element.text())) { continue; } Region region = new JsoupRegion().new Region(); String code = element.attr("href").substring(3, 9); region.setCode(code); region.setName(element.text()); region.setParentId(Integer.valueOf(code.substring(0, 4))); region.setType(""); list.add(region); } } return list; } /** * @说明: 镇 * @param @param url * @param @return * @param @throws IOException * @return List<Region> * @throws */ private static List<Region> getTown(String url) throws IOException { List<Region> list = new ArrayList<Region>(); Document doc = url2Doc(url); Elements proviceTr = doc.getElementsByAttributeValue("class", "towntr");// 通过css获取tr for (Element e : proviceTr) { Elements tds = e.select("a[href]"); for (Element element : tds) { if (Integer(element.text())) { continue; } Region region = new JsoupRegion().new Region(); String code = element.attr("href").substring(3, 12); region.setCode(code); region.setName(element.text()); region.setParentId(Integer.valueOf(code.substring(0, 6))); region.setType(""); list.add(region); } } return list; } /** * @说明: 村 * @param @param url * @param @return * @param @throws IOException * @return List<Region> * @throws */ private static List<Region> getVillage(String url) throws IOException { List<Region> list = new ArrayList<Region>(); Document doc = url2Doc(url); Elements proviceTr = doc.getElementsByAttributeValue("class", "villagetr");// 通过css获取tr for (Element e : proviceTr) { Elements trs = e.select("tr"); for (Element element : trs) { Elements tds = element.select("td"); Region region = new JsoupRegion().new Region(); for (Element element2 : tds) { String value = element2.text(); if (Integer(value) && 
value.length() == 3) { region.setType(element2.text()); } if (Integer(value) && value.length() > 3) { region.setCode(value); region.setParentId(Integer.valueOf(value.substring(0, 9))); } else { region.setName(value); } } list.add(region); } } return list; } public static void insertDb() { } /** * @Description: * @param * @return Connection 返回类型 * @throws */ private static Connection connection() throws ClassNotFoundException, SQLException { // 调用Class.forName()方法加载驱动程序 Class.forName("com.mysql.jdbc.Driver"); String url = "jdbc:mysql://localhost:3306/cmm-db?useUnicode=true&characterEncoding=utf-8&autoReconnect=true"; // JDBC的URL Connection conn = DriverManager.getConnection(url, "root", "root"); return conn; } public static void insertRegion(Region region) throws ClassNotFoundException, SQLException { Connection conn = connection(); conn.setAutoCommit(false); // 插入数据的代码 String sql2 = "insert into region(area_code,area_name,parent_code,place_order) values(?,?,?,?) "; PreparedStatement pst = conn.prepareStatement(sql2); pst.setString(1, region.getCode()); pst.setString(2, region.getName()); pst.setString(3, region.getParentId()+""); pst.setString(4, region.getCode()); pst.addBatch(); // 执行批量更新 pst.executeBatch(); // 语句执行完毕,提交本事务 conn.commit(); pst.close(); conn.close(); } public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException { String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/"; List<Region> all = new ArrayList<Region>(); List<Region> province = getProvince(url + "/index.html"); all.addAll(province); List<String> done = new ArrayList<String>();// 用来存已经抓取过的省份 for (Region regionProvince : province) {// 遍历省 if (done.contains(regionProvince.getCode())) { continue; } insertRegion(regionProvince); System.out.println(regionProvince.getCode() + regionProvince.getName()); List<Region> city = getCity(url + regionProvince.getCode() + ".html"); for (Region regionCity : city) {// 遍历市 insertRegion(regionCity); 
System.out.println(regionCity.getCode() + "||" + regionCity.getName()); List<Region> county = getCounty(url + regionProvince.getCode() + "/" + regionCity.getCode() + ".html"); all.addAll(county); for (Region regionCounty : county) {// 遍历县 insertRegion(regionCounty); System.out.println(regionCounty.getCode() + "||" + regionCounty.getName()); List<Region> town = getTown(url + regionProvince.getCode() + "/" + regionCity.getCode().substring(2, 4) + "/" + regionCounty.getCode() + ".html"); all.addAll(town); for (Region regionTown : town) {// 遍历镇 insertRegion(regionTown); System.out.println(regionTown.getCode() + "||" + regionTown.getName()); List<Region> village = getVillage(url + regionProvince.getCode() + "/" + regionCity.getCode().substring(2, 4) + "/" + regionCounty.getCode().substring(4, 6) + "/" + regionTown.getCode() + ".html"); all.addAll(village); for (Region regionVillage : village) {// 遍历村 insertRegion(regionVillage); System.out.println(regionVillage.getCode() + "||" + regionVillage.getName()); } } } } } } }
标签:
原文地址:http://my.oschina.net/ydsakyclguozi/blog/419740