一、配置 Maven 依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.1.2</version>
</dependency>
二、代码
1、获取网页内容
package com.chenanyi.fuli.Helper;

import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;

/**
 * Fetches the body of a web page using Apache HttpClient.
 */
public class GetHHH {

    /**
     * Downloads the page at {@code url} and returns its body as text.
     *
     * <p>A fresh client is created for every call and its connection manager is shut
     * down before the method returns. Any exception raised while executing the request
     * or reading the body is printed to standard error and not rethrown.
     *
     * @author 陈安一
     * @param url address of the page to fetch
     * @return the body as decoded by {@code EntityUtils.toString}, or {@code null} when
     *         the response has no entity or the request failed
     */
    public static String getContentFormUrl(String url) {
        HttpClient httpClient = new DefaultHttpClient();
        // Built outside the try block so an invalid URL still propagates to the caller.
        HttpGet request = new HttpGet(url);
        try {
            HttpResponse reply = httpClient.execute(request);
            HttpEntity body = reply.getEntity();
            return body == null ? null : EntityUtils.toString(body);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // Release the connections held by this one-shot client.
            httpClient.getConnectionManager().shutdown();
        }
    }
}
2、获取列表页内所有标题的 URL
package com.chenanyi.fuli.Helper;

import java.util.ArrayList;
import java.io.StringReader;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex and XPath helpers for pulling fragments out of downloaded page text.
 */
public class RegContent {

    /**
     * Collects every match of a regular expression in the given text.
     *
     * @author 陈安一
     * @param reg  regular expression to search for
     * @param info text to search; {@code null} (e.g. a failed download) is treated as
     *             "no matches" instead of raising a NullPointerException
     * @return the full text of each match in order of appearance; never {@code null}
     */
    public static List<String> GetCon(String reg, String info) {
        List<String> result = new ArrayList<String>();
        if (info == null) {
            return result;
        }
        Matcher m = Pattern.compile(reg).matcher(info);
        while (m.find()) {
            result.add(m.group());
        }
        return result;
    }

    /**
     * Parses the text as XML and prints the text of the first node matching
     * {@code //body/div/div/div} to standard output.
     *
     * <p>Nothing is printed when no node matches; a parse failure is printed to
     * standard error.
     *
     * @param info markup to parse; may be {@code null}
     * @return {@code info}, unchanged
     */
    public static String GetDiv(String info) {
        if (info == null) {
            return null;
        }
        // NOTE(review): the reader parses untrusted markup with default settings;
        // consider disabling DTDs/external entities if this is used on arbitrary pages.
        SAXReader reader = new SAXReader();
        try {
            Document doc = reader.read(new StringReader(info));
            Node node = doc.selectSingleNode("//body/div/div/div");
            // selectSingleNode yields null when the path matches nothing.
            if (node != null) {
                System.out.println(node.getText());
            }
        } catch (DocumentException e) {
            e.printStackTrace();
        }
        return info;
    }

    /**
     * Returns the last match of a regular expression in the given text.
     *
     * @param reg  regular expression to search for
     * @param info text to search; may be {@code null}
     * @return the text of the last match, or {@code info} itself when nothing matches
     *         (so {@code null} in gives {@code null} out)
     */
    public static String GetOneCon(String reg, String info) {
        if (info == null) {
            return null;
        }
        String result = info;
        Matcher m = Pattern.compile(reg).matcher(info);
        while (m.find()) {
            result = m.group();
        }
        return result;
    }

    /**
     * Turns each entry of the list into an absolute article URL by prefixing
     * {@code http://www.laossee.com/} and appending {@code .html}.
     *
     * <p>The list is modified in place.
     *
     * @author 陈安一
     * @param result page identifiers such as those returned by {@link #GetCon}
     * @return the same list instance, now holding the URLs
     */
    public static List<String> GetallURL(List<String> result) {
        for (int i = 0; i < result.size(); i++) {
            result.set(i, "http://www.laossee.com/" + result.get(i) + ".html");
        }
        return result;
    }
}
3、将内容保存到电脑中
package com.chenanyi.fuli.Helper;

import java.io.FileWriter;
import java.io.IOException;

/**
 * Appends text to a file on the local disk.
 */
public class SaveTxt {

    /**
     * Appends {@code cont} to the file named {@code title}, creating the file if needed.
     *
     * <p>I/O failures (for example a missing parent directory) are printed to standard
     * error and not rethrown. The text is written with the platform default charset.
     *
     * @author 陈安一
     * @param title path of the target file, e.g. {@code "noexists.txt"}
     * @param cont  text to append
     */
    public static void Sava(String title, String cont) {
        FileWriter fileWriter = null;
        try {
            fileWriter = new FileWriter(title, true);
            fileWriter.write(cont);
            fileWriter.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The writer is still null when opening the file failed; closing it
            // unconditionally would replace the reported I/O error with an NPE.
            if (fileWriter != null) {
                try {
                    fileWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
4、图片类的处理--下载图片保存到本地
package com.chenanyi.fuli.Helper;

import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads a resource and stores it as a numbered {@code .jpg} file.
 */
public class Download {

    /**
     * Downloads {@code url} into {@code <path>/<count>.jpg}.
     *
     * <p>The target directory is created when it does not exist. Failures (bad URL,
     * connection or file errors) are printed to standard error and not rethrown.
     *
     * @author 陈安一
     * @param url   address of the resource to download
     * @param path  directory the file is saved into
     * @param count number used as the file name (without extension)
     */
    public static void down(String url, String path, int count) {
        InputStream is = null;
        OutputStream os = null;
        try {
            URL img_url = new URL(url);
            URLConnection con = img_url.openConnection();
            // Give up connecting after 5 seconds.
            con.setConnectTimeout(5 * 1000);
            is = con.getInputStream();

            File sf = new File(path);
            if (!sf.exists()) {
                sf.mkdirs();
            }
            // Resolve the file against the directory instead of concatenating "\\",
            // which only forms a valid path on Windows.
            os = new FileOutputStream(new File(sf, count + ".jpg"));

            // Copy through a 1 KB buffer.
            byte[] bs = new byte[1024];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close both streams even when the copy failed part-way.
            close(os);
            close(is);
        }
    }

    /** Closes the stream if it was opened, reporting (not propagating) close errors. */
    private static void close(Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
5、根据获取的url处理返回的html代码,提取小说或者图片保存到文件中
(1)、获取txt的
package com.chenanyi.fuli.NBHelp;

import java.io.File;
import java.util.List;
import com.chenanyi.fuli.Helper.GetHHH;
import com.chenanyi.fuli.Helper.RegContent;
import com.chenanyi.fuli.Helper.SaveTxt;

/**
 * Crawls article list pages and saves each article's text under {@code txt/}.
 */
public class GetTxT {

    /**
     * Crawls list pages {@code 1..pagecount} of a category and appends the text of every
     * linked article to {@code txt/<title>.txt}.
     *
     * <p>Pages or articles that cannot be downloaded are skipped.
     *
     * @author 陈安一
     * @param cate      category id used in the list URL (16 is the "RQ" novel category)
     * @param pagecount total number of list pages to crawl
     */
    public static void Gettxt(int cate, int pagecount) {
        // Files are written below txt/, so make sure the directory is there.
        new File("txt").mkdirs();

        // Inclusive upper bound: pagecount is the number of pages to crawl.
        for (int m = 1; m <= pagecount; m++) {
            int count = 0;
            String url = "http://www.laossee.com/article-list-id-" + cate + "-page-" + m + ".html";
            String info = GetHHH.getContentFormUrl(url);
            if (info == null) {
                continue; // list page could not be fetched
            }
            List<String> result = RegContent.GetallURL(RegContent.GetCon("article-show-id-\\d{6}", info));
            for (int i = 0; i < result.size(); i++) {
                String cont = GetHHH.getContentFormUrl(result.get(i));
                if (cont == null) {
                    continue; // article could not be fetched
                }
                List<String> titles = RegContent.GetCon("<title>.*?</title>", cont);

                // The article text is taken from the fragments between <br /> tags.
                List<String> Content = RegContent.GetCon("<br />.*?<br />", cont);
                StringBuilder joined = new StringBuilder();
                for (int f = 0; f < Content.size(); f++) {
                    joined.append(Content.get(f));
                }
                String conts = joined.toString().replace("<br />", "");

                for (int j = 0; j < titles.size(); j++) {
                    count++;
                    String title = toFileName(titles.get(j));
                    SaveTxt.Sava(title, conts);
                    System.out.println("第" + m + "页第" + count + "个" + title);
                }
            }
        }
    }

    /** Builds the {@code txt/<title>.txt} path from a {@code <title>...</title>} tag. */
    private static String toFileName(String titleTag) {
        // Yields ">text<"; slashes and spaces are dropped before the brackets are cut off.
        String title = RegContent.GetOneCon(">.*?<", titleTag);
        title = title.replace("/", "").replace(" ", "");
        // Parentheses are blanked out. The published listing had its quote characters
        // mangled; ASCII and full-width parentheses are assumed here.
        return "txt/"
                + title.substring(1, title.length() - 1)
                        .replace('(', ' ').trim()
                        .replace(')', ' ').trim()
                        .replace('\uFF08', ' ').trim()
                        .replace('\uFF09', ' ').trim()
                + ".txt";
    }
}
(2)、下载图片
package com.chenanyi.fuli.NBHelp;

import java.util.ArrayList;
import java.util.List;
import com.chenanyi.fuli.Helper.Download;
import com.chenanyi.fuli.Helper.GetHHH;
import com.chenanyi.fuli.Helper.RegContent;

/**
 * Crawls article list pages and collects or downloads the images of each article.
 */
public class Getimg {

    /**
     * Crawls list pages {@code 1..pagecount} of a category and downloads every image
     * found in the linked articles, naming the files with a running number.
     *
     * @author 陈安一
     * @param cate      category id used in the list URL (6 is the "ZPTP" picture category)
     * @param pagecount number of list pages to crawl
     * @param path      directory the images are saved into
     */
    public static void Getimg(int cate, int pagecount, String path) {
        int count = 0;
        for (int m = 1; m <= pagecount; m++) {
            for (String articleUrl : articleUrls(cate, m)) {
                for (String imageUrl : imageUrls(articleUrl)) {
                    count++;
                    Download.down(imageUrl, path, count);
                    System.out.println(count + "\tOK");
                }
            }
        }
    }

    /**
     * Collects the image URLs of every article linked from one list page, printing each
     * URL to standard output as it is found.
     *
     * @author 陈安一
     * @param cate category id used in the list URL (6 is the "ZPTP" picture category)
     * @param page number of the list page to read
     * @return the image URLs in order of appearance; empty when nothing was found
     */
    public static List<String> GetOnePageimg(int cate, int page) {
        List<String> img_url = new ArrayList<String>();
        for (String articleUrl : articleUrls(cate, page)) {
            for (String imageUrl : imageUrls(articleUrl)) {
                System.out.println(imageUrl);
                img_url.add(imageUrl);
            }
        }
        return img_url;
    }

    /** Returns the article URLs linked from one list page (empty if the page fails to load). */
    private static List<String> articleUrls(int cate, int page) {
        String url = "http://www.laossee.com/article-list-id-" + cate + "-page-" + page + ".html";
        String info = GetHHH.getContentFormUrl(url);
        if (info == null) {
            return new ArrayList<String>();
        }
        return RegContent.GetallURL(RegContent.GetCon("article-show-id-\\d{6}", info));
    }

    /** Returns the {@code src} value of every {@code <img>} tag in one article page. */
    private static List<String> imageUrls(String articleUrl) {
        List<String> urls = new ArrayList<String>();
        String cont = GetHHH.getContentFormUrl(articleUrl);
        if (cont == null) {
            return urls;
        }
        List<String> tags = RegContent.GetCon("<img src=\"(.*?)/>", cont);
        for (int j = 0; j < tags.size(); j++) {
            // Skip the 10 characters of <img src=" and cut at the closing quote.
            String temp = tags.get(j).substring(10);
            int index = temp.indexOf("\"");
            if (index < 0) {
                continue; // malformed tag without a closing quote
            }
            urls.add(temp.substring(0, index));
        }
        return urls;
    }
}
6、运行!
package com.chenanyi.fuli.start;

import com.chenanyi.fuli.NBHelp.Getimg;

/**
 * Console entry point of the crawler.
 */
public class Start {

    /**
     * Prints the usage banner, reads the category, the page count and the target
     * directory from standard input, then runs the image crawler with them.
     *
     * <p>Any exception is printed to standard error.
     *
     * @author 陈安一
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        final String rule = "**************************************************************";
        final String[] banner = {
            rule,
            rule,
            "第一个参数,分类(6,7,13),小说(14,15,16)",
            rule,
            "第二个参数,获取的总页码数,总页码数>=1",
            rule,
            "第三个参数,保存的地址 : 格式 f:\\\\image4\\\\",
            rule,
        };
        try {
            java.util.Scanner input = new java.util.Scanner(System.in);
            for (String bannerLine : banner) {
                System.out.println(bannerLine);
            }

            System.out.println("请输入第一个参数");
            int category = input.nextInt();
            System.out.println("请输入第二个参数");
            int pages = input.nextInt();
            System.out.println("请输入第三个参数");
            String targetDir = input.next();

            System.out.println("开始执行");
            Getimg.Getimg(category, pages, targetDir);
            System.out.println("执行完毕");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
7、给我评论!
本文出自 “大包子” 博客,转载请与作者联系!
原文地址:http://dabaozi.blog.51cto.com/8032088/1618506