CrawlerPicture.java 文件
package com.lym.crawlerDemo;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.lym.mode.Picture;

/**
 * Crawls pictures from the list pages of http://m.qqba.com/ and stores them
 * on the local disk.
 *
 * @author Administrator
 */
public class CrawlerPicture {

    /** Number of the first list page that is crawled (inclusive). */
    public final static int STARTPAGE = 301;

    /** Number at which crawling stops (exclusive): the last page fetched is {@code ENDPAGE - 1}. */
    public final static int ENDPAGE = 500;

    /**
     * Characters that must not appear in a file name component: the path
     * separators, the characters Windows reserves, and control characters.
     */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS =
            Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /**
     * Collects the {@code src} and {@code alt} attribute values of every
     * {@code img} element found inside a {@code div.image-cell} element on
     * each list page from {@link #STARTPAGE} up to, but not including,
     * {@link #ENDPAGE}.
     *
     * @return one {@link Picture} per image found, with ids numbered
     *         consecutively from 1 in the order the images were encountered
     * @throws IOException if any list page cannot be fetched; pictures
     *         collected up to that point are discarded
     */
    public static List<Picture> getPictureUrl() throws IOException {
        int nextId = 1;
        List<Picture> pics = new ArrayList<Picture>();
        for (int page = STARTPAGE; page < ENDPAGE; page++) {
            String url = "http://m.qqba.com/people/list/" + page + ".htm";
            Document doc = Jsoup.connect(url).get();
            Elements cells = doc.body().select("div.image-cell");
            for (Element cell : cells) {
                // every img element nested inside this cell
                for (Element img : cell.select("img")) {
                    Picture pic = new Picture();
                    pic.setId(nextId++);
                    pic.setSrc(img.attr("src"));
                    pic.setAlt(img.attr("alt"));
                    pics.add(pic);
                }
            }
        }
        return pics;
    }

    /**
     * Opens a stream over the content of a picture.
     *
     * @param picUrl URL of the picture
     * @return an open stream over the picture bytes; the caller must close it
     * @throws IOException if the URL is malformed or cannot be opened
     */
    public static InputStream getPictureInputStream(String picUrl) throws IOException {
        URL url = new URL(picUrl);
        return new DataInputStream(url.openStream());
    }

    /**
     * Copies a picture stream to
     * {@code D:/picture/<alt>--<id>.jpg}, creating the target directory when
     * it does not exist yet. Characters of the alt text that are not allowed
     * in file names are replaced by {@code _}.
     *
     * @param in  stream to read the picture bytes from; it is read until end
     *            of stream but not closed by this method
     * @param pic picture whose alt text and id make up the file name
     * @throws IOException if the directory cannot be created or reading or
     *         writing fails
     */
    public static void savePicture(InputStream in, Picture pic) throws IOException {
        // location of the picture on disk
        String newImgUrl = "D:/picture/" + sanitizeFileName(pic.getAlt())
                + "--" + pic.getId() + ".jpg";
        File target = new File(newImgUrl);
        File dir = target.getParentFile();
        // mkdirs() also returns false when the directory already exists,
        // hence the second check.
        if (dir != null && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create directory " + dir);
        }

        FileOutputStream fos = new FileOutputStream(target);
        try {
            byte[] buf = new byte[1024];
            int len;
            // read() signals end of stream with -1
            while ((len = in.read(buf)) != -1) {
                fos.write(buf, 0, len);
            }
        } finally {
            // release the file handle even when copying fails
            fos.close();
        }
    }

    /**
     * Replaces every character that is unsafe in a file name by {@code _}.
     * A {@code null} name is treated as the text {@code "null"}, which is what
     * plain string concatenation would have produced.
     */
    private static String sanitizeFileName(String name) {
        return ILLEGAL_FILE_NAME_CHARS.matcher(String.valueOf(name)).replaceAll("_");
    }

    /**
     * Downloads every picture returned by {@link #getPictureUrl()}. A picture
     * that fails to download is reported on standard error and skipped so
     * that the remaining pictures are still fetched.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            List<Picture> pics = getPictureUrl();
            System.out.println("图片正在下载...");
            for (Picture pic : pics) {
                try {
                    InputStream in = getPictureInputStream(pic.getSrc());
                    try {
                        savePicture(in, pic);
                    } finally {
                        // close the download stream even when saving fails
                        in.close();
                    }
                } catch (IOException e) {
                    System.err.println("Failed to download " + pic + ": " + e);
                }
            }
            System.out.println("下载完成!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Picture.java 文件
package com.lym.mode; public class Picture { /** * 图片编号 */ private int id; /** * 图片地址 */ private String src; /** * 图片说明信息 */ private String alt; public int getId() { return id; } public void setId(int id) { this.id = id; } public String getSrc() { return src; } public void setSrc(String src) { this.src = src; } public String getAlt() { return alt; } public void setAlt(String alt) { this.alt = alt; } @Override public String toString() { return "Picture [id=" + id + ", src=" + src + ", alt=" + alt + "]"; } }
版权声明:本文为博主原创文章,未经博主允许不得转载。
原文地址:http://blog.csdn.net/u014740338/article/details/47090245