码迷,mamicode.com
首页 > 编程语言 > 详细

Java爬虫,网页简易爬小说程序

时间:2020-03-16 14:28:41      阅读:86      评论:0      收藏:0      [点我收藏+]

标签:sel   bsp   charset   otf   print   param   txt   图片   https   

package PaChong;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import java.io.BufferedOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Scanner;


/**
 * Minimal crawler: downloads a fixed range of numbered HTML pages, extracts the
 * first {@code <h1>} and the children of the element with id {@code content}
 * from each page, and appends them to a local text file.
 */
public class Main {
    /** Prefix of every page address; the page id and ".html" are appended to it. */
    private static final String PAGE_URL_PREFIX = "https://www.biquges.com/11_11744/";

    /** First page id fetched (inclusive). */
    private static final int FIRST_PAGE_ID = 7449572;

    /** Last page id fetched (inclusive). */
    private static final int LAST_PAGE_ID = 7450351;

    /** File the extracted text is appended to (relative to the working directory). */
    private static final String OUTPUT_FILE = "1234.txt";

    /** Charset name used to decode pages when the caller supplies none. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /**
     * Runs the crawl, decoding pages as UTF-8.
     *
     * @param strURL link address; forwarded unchanged to {@link #is(String, String)}
     */
    public static void is(String strURL) {
        is(strURL, DEFAULT_CHARSET);
    }

    /**
     * Runs the crawl with an explicit page charset.
     *
     * @param strURL  link address; forwarded unchanged to
     *                {@link #getContentFromUrl(String, String)}
     * @param charset name of the charset used to decode the downloaded pages
     */
    public static void is(String strURL, String charset) {
        getContentFromUrl(strURL, charset);
    }

    /**
     * Fetches every page from {@link #FIRST_PAGE_ID} to {@link #LAST_PAGE_ID} and
     * appends its heading and content to {@link #OUTPUT_FILE}.
     *
     * <p>A page that cannot be fetched, parsed or written is reported and skipped;
     * the remaining pages are still processed. If the output file cannot be opened,
     * no page is requested at all.
     *
     * @param myUrl   link address. NOTE(review): this value is not used; the
     *                addresses are always built from {@link #PAGE_URL_PREFIX}.
     *                Kept so existing callers continue to compile and behave the same.
     * @param charset name of the charset used to decode the downloaded pages;
     *                {@code null} or empty falls back to UTF-8
     */
    public static void getContentFromUrl(String myUrl, String charset) {
        String pageCharset = (charset == null || charset.isEmpty()) ? DEFAULT_CHARSET : charset;

        // try-with-resources guarantees the buffered data is flushed and the file
        // handle released even if the loop ends abnormally.
        try (BufferedOutputStream out =
                     new BufferedOutputStream(new FileOutputStream(OUTPUT_FILE, true))) {
            System.out.println("开始");
            for (int pageId = FIRST_PAGE_ID; pageId <= LAST_PAGE_ID; pageId++) {
                String address = PAGE_URL_PREFIX + pageId + ".html";
                try {
                    Document page = Jsoup.parseBodyFragment(fetchPage(address, pageCharset));
                    writePage(page, out);
                } catch (IOException | RuntimeException e) {
                    // Best effort: one bad page must not abort the whole run.
                    System.out.println(address + "====》》》访问异常。。。");
                    System.err.println(address + " failed: " + e);
                }
            }
        } catch (FileNotFoundException e) {
            // Thrown only while opening the output file; per-page failures are
            // handled inside the loop.
            System.err.println("Cannot open " + OUTPUT_FILE + " for appending: " + e);
        } catch (IOException e) {
            // Reaching here means the implicit close()/flush of the output failed.
            System.err.println("Failed to flush or close " + OUTPUT_FILE + ": " + e);
        }
        System.out.println("结束");
    }

    /**
     * Downloads one page and returns its text with every line terminated by CRLF.
     *
     * @param address absolute URL of the page
     * @param charset name of the charset used to decode the response body
     * @return the decoded page text
     * @throws IOException if the connection cannot be opened or reading fails
     */
    private static String fetchPage(String address, String charset) throws IOException {
        URLConnection connection = new URL(address).openConnection();
        StringBuilder page = new StringBuilder();
        // Closing the Scanner also closes the underlying stream; both are listed so
        // the stream is released even if the Scanner constructor throws.
        try (InputStream in = connection.getInputStream();
             Scanner lines = new Scanner(in, charset)) {
            while (lines.hasNextLine()) {
                page.append(lines.nextLine()).append("\r\n");
            }
            // Scanner hides read errors and just reports "no more input"; surface
            // them so a truncated page is not written as if it were complete.
            IOException readFailure = lines.ioException();
            if (readFailure != null) {
                throw readFailure;
            }
        }
        return page.toString();
    }

    /**
     * Appends the heading and content of a parsed page to the output stream and
     * echoes the heading to standard output.
     *
     * @param page parsed page
     * @param out  destination stream; text is encoded as UTF-8
     * @throws IOException if the page lacks an {@code <h1>} or a {@code #content}
     *                     element, or if writing fails
     */
    private static void writePage(Document page, BufferedOutputStream out) throws IOException {
        // Validate both elements before writing anything, so a malformed page does
        // not leave an orphan heading in the output file.
        Elements headings = page.getElementsByTag("h1");
        if (headings.isEmpty()) {
            throw new IOException("page has no <h1> element");
        }
        Element content = page.getElementById("content");
        if (content == null) {
            throw new IOException("page has no element with id \"content\"");
        }

        String heading = headings.get(0).text();
        out.write(heading.getBytes(StandardCharsets.UTF_8));
        System.out.println(heading);

        List<Node> nodes = content.childNodes();
        // NOTE(review): the last child node is intentionally left out, exactly as
        // before; presumably it is trailing non-text markup — confirm against the site.
        for (int i = 0; i < nodes.size() - 1; i++) {
            Node node = nodes.get(i);
            if (node == null) {
                continue;
            }
            String html = node.toString();
            // Skip bare separators; everything else is written on its own line.
            if (" ".equals(html) || "<br>".equals(html)) {
                continue;
            }
            out.write(html.replaceAll("&nbsp;", "").getBytes(StandardCharsets.UTF_8));
            out.write("\r\n".getBytes(StandardCharsets.UTF_8));
        }
        out.write("\r\n".getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Entry point: runs the crawl with the default charset.
     *
     * <p>Two-argument usage example:
     * {@code Main.is("https://www.biquges.com/11_11744/7449454.html", "utf-8");}
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Main.is("");
    }
}

结果:

技术图片

 

Java爬虫,网页简易爬小说程序

标签:sel   bsp   charset   otf   print   param   txt   图片   https   

原文地址:https://www.cnblogs.com/pastjx/p/12503422.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!