Cloning an entire website is an interesting task, and a few conditions have to be met.
The saved files must be viewable statically, which means every path inside the HTML files has to be a relative path.
That in turn means rewriting the links in each HTML file: if the links are not rewritten, opening the pages locally will still pull resources from the original site.
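The rewriting boils down to computing, for every link, the relative path between where the page will be saved and where the referenced resource will be saved. Below is a minimal sketch of that idea with java.nio, assuming the local layout mirrors the URL path; the example.com URLs and the clone folder are made up for illustration, and the full program further down does the same thing in path2relative.

import java.nio.file.Path;
import java.nio.file.Paths;

// Minimal sketch: the page is saved under <root>/<host>/<path>, the resource
// likewise, and the link inside the saved page becomes the relative path
// between the two local files.
class RelativeLinkSketch {
    static String relativeLink(Path root, String pageUrl, String resourceUrl) {
        Path pageFile = root.resolve(pageUrl.substring(pageUrl.indexOf("://") + 3));
        Path resourceFile = root.resolve(resourceUrl.substring(resourceUrl.indexOf("://") + 3));
        // relativize yields e.g. "../img/logo.png"; backslashes show up on Windows
        return pageFile.getParent().relativize(resourceFile).toString().replace('\\', '/');
    }

    public static void main(String[] args) {
        Path root = Paths.get("clone");
        // prints ../img/logo.png
        System.out.println(relativeLink(root,
                "http://example.com/a/page.htm",
                "http://example.com/img/logo.png"));
    }
}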
One big pitfall: if redirects are not disabled, you can end up in an infinite loop.
Say you visit page a, a references b, and b redirects back to a; the crawler then produces a/a/a/a/a...
The simplest fix is to disable redirects altogether; the more involved fix is to detect the cycle and stop as soon as a path loop appears.
The root of the problem is that after a redirect the URL the crawler records is no longer the correct one.
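The program below takes the simple route: it sets the maximum number of redirects to zero through WebCollector's Configuration (setMaxRedirect(0) in the constructor). For the more involved route, a small hypothetical helper could flag URLs whose path repeats the same segment back to back, which is exactly the a/a/a/... pattern such a loop produces. This is only a sketch and is not used by the program below.

// Hypothetical loop check: a URL path that repeats a segment back to back
// (".../a/a/...") is assumed to come from a redirect cycle and is skipped.
class RedirectLoopDetector {
    static boolean looksLikeLoop(String url) {
        // drop the scheme and host, keep only the path
        String path = url.replaceFirst("^[a-zA-Z][a-zA-Z0-9+.-]*://[^/]*", "");
        String[] segments = path.split("/");
        for (int i = 1; i < segments.length; i++) {
            if (!segments[i].isEmpty() && segments[i].equals(segments[i - 1])) {
                return true;
            }
        }
        return false;
    }
}

With that pitfall out of the way, the Maven setup just pulls in WebCollector and jsoup: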
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>wyf</groupId>
    <artifactId>CloneSite</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/cn.edu.hfut.dmic.webcollector/WebCollector -->
        <dependency>
            <groupId>cn.edu.hfut.dmic.webcollector</groupId>
            <artifactId>WebCollector</artifactId>
            <version>2.71</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>utf-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
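The crawler itself is a single class extending BreadthCrawler: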
import cn.edu.hfut.dmic.webcollector.conf.Configuration;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
public class Main extends BreadthCrawler {
    static String seed = "http://www.xqbase.com/computer.htm";
    // only pages whose URL starts with this prefix are crawled as HTML
    static String prefix = "http://www.xqbase.com/computer";
    // local folder the cloned site is written to
    static Path targetFolder = Paths.get("haha").toAbsolutePath();
    int maxRedirect = 0; // 0 disables following redirects

    public Main(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        this.getConf().setMaxRedirect(maxRedirect);
        this.addSeed(seed);
    }

    // control characters and <>:"|?* are not allowed in (Windows) file names
    boolean isInvalidPathChar(char var0) {
        return var0 < ' ' || "<>:\"|?*".indexOf(var0) != -1;
    }

    boolean isInvalidPath(String path) {
        for (int i = 0; i < path.length(); i++) {
            if (isInvalidPathChar(path.charAt(i))) return true;
        }
        return false;
    }
    /**
     * Convert a URL into a local path, used for saving page content to disk.
     *
     * @param url  absolute URL
     * @param type file type, used to decide the extension of the saved file
     */
    Path url2path(String url, String type) {
        int beg = url.indexOf(":") + 3; // skip the "scheme://" part
        String path = url.substring(beg);
        // if the name contains illegal characters, fall back to its hash code
        if (isInvalidPath(path)) {
            path = path.hashCode() + "";
        }
        if (type != null && !path.endsWith("." + type)) {
            path += '.' + type;
        }
        return targetFolder.resolve(path);
    }

    /**
     * htmlUrl is the URL of the current HTML page, resourceUrl the URL of a resource
     * it references; returns the relative path between the two local files.
     * resourceType forces the extension of the saved resource file.
     */
    String path2relative(String htmlUrl, String resourceUrl, String resourceType) {
        return url2path(htmlUrl, "html").getParent()
                .relativize(url2path(resourceUrl, resourceType))
                .toString().replace('\\', '/');
    }
    /**
     * Create a directory recursively, used before writing a file.
     */
    void mkdir(Path p) {
        p = p.toAbsolutePath();
        if (Files.exists(p)) return;
        if (Files.notExists(p.getParent())) mkdir(p.getParent());
        try {
            Files.createDirectory(p);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Save a text file.
     */
    void writeFile(Path path, String content, Charset encoding) {
        mkdir(path.getParent());
        try (BufferedWriter cout = Files.newBufferedWriter(path, encoding)) {
            cout.write(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Save a binary file.
     */
    void writeFile(Path path, byte[] data) {
        mkdir(path.getParent());
        try (OutputStream cout = Files.newOutputStream(path)) {
            cout.write(data);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // rewrite the src attributes of script/svg/img tags and queue the resources as binary downloads
    void src(Page page, CrawlDatums crawlDatums, Document doc) {
        String[] tags = new String[]{"script", "svg", "img"};
        for (String tag : tags) {
            for (Element e : doc.select(tag)) {
                if (!e.hasAttr("src")) continue;
                String s = e.absUrl("src");
                if (s.trim().length() == 0) continue;
                e.attr("src", path2relative(page.url(), s, null));
                crawlDatums.add(new CrawlDatum(s, "binary"));
            }
        }
    }

    // rewrite the href attributes of <link> tags (stylesheets, icons, ...) and queue them as binary downloads
    void hrefOfResource(Page page, CrawlDatums crawlDatums, Document doc) {
        String[] tags = new String[]{"link"};
        for (String tag : tags) {
            for (Element e : doc.select(tag)) {
                if (!e.hasAttr("href")) continue;
                String s = e.absUrl("href");
                if (s.trim().length() == 0) continue;
                e.attr("href", path2relative(page.url(), s, null));
                crawlDatums.add(new CrawlDatum(s, "binary"));
            }
        }
    }

    // rewrite <a href> links to relative .html paths; only URLs under prefix are crawled further
    void hrefOfHtml(Page page, CrawlDatums crawlDatums, Document doc) {
        for (Element e : doc.select("a")) {
            if (e.hasAttr("href")) {
                String s = e.absUrl("href");
                if (s.trim().length() == 0) continue;
                e.attr("href", path2relative(page.url(), s, "html"));
                if (s.startsWith(prefix)) {
                    crawlDatums.add(s);
                }
            }
        }
    }
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        if (page.matchType("binary")) {
            // resource files are written to disk as-is
            writeFile(url2path(page.url(), null), page.content());
        } else {
            // HTML pages get their links rewritten before being saved
            Document doc = page.doc();
            src(page, crawlDatums, doc);
            hrefOfResource(page, crawlDatums, doc);
            hrefOfHtml(page, crawlDatums, doc);
            writeFile(url2path(page.url(), "html"), doc.html(), doc.charset());
        }
    }

    public static void main(String[] args) throws Exception {
        // autoParse controls whether the engine extracts links itself; here we do it manually
        Main blog = new Main("webcollector", false);
        Configuration conf = blog.getConf();
        conf.setConnectTimeout(3000);
        blog.start(Integer.MAX_VALUE);
    }
}
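After a run, the cloned pages and resources land under the haha directory (targetFolder), mirroring the URL structure of the site; the seed page, for example, is saved as haha/www.xqbase.com/computer.htm.html, since url2path appends an .html suffix when it is missing, and can be opened locally with all links resolving to the copied files.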