Cloning an entire website is an interesting task, and a few conditions have to be met.
The saved files must be viewable statically, which means every path inside the HTML files has to be a relative path.
That in turn means rewriting the links in each HTML file: if the links are not rewritten, opening the pages locally will still pull resources from the original site.
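The rewriting boils down to computing, for every link, the relative path between where the page will be saved and where the referenced resource will be saved. Below is a minimal sketch of that idea with java.nio, assuming the local layout mirrors the URL path; the example.com URLs and the clone folder are made up for illustration, and the full program further down does the same thing in path2relative.

import java.nio.file.Path;
import java.nio.file.Paths;

// Minimal sketch: the page is saved under <root>/<host>/<path>, the resource
// likewise, and the link inside the saved page becomes the relative path
// between the two local files.
class RelativeLinkSketch {
    static String relativeLink(Path root, String pageUrl, String resourceUrl) {
        Path pageFile = root.resolve(pageUrl.substring(pageUrl.indexOf("://") + 3));
        Path resourceFile = root.resolve(resourceUrl.substring(resourceUrl.indexOf("://") + 3));
        // relativize yields e.g. "../img/logo.png"; backslashes show up on Windows
        return pageFile.getParent().relativize(resourceFile).toString().replace('\\', '/');
    }

    public static void main(String[] args) {
        Path root = Paths.get("clone");
        // prints ../img/logo.png
        System.out.println(relativeLink(root,
                "http://example.com/a/page.htm",
                "http://example.com/img/logo.png"));
    }
}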
One big pitfall: if redirects are not disabled, you can end up in an infinite loop.
Say you visit page a, a references b, and b redirects back to a; the crawler then produces a/a/a/a/a...
The simplest fix is to disable redirects altogether; the more involved fix is to detect the cycle and stop as soon as a path loop appears.
The root of the problem is that after a redirect the URL the crawler records is no longer the correct one.
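The program below takes the simple route: it sets the maximum number of redirects to zero through WebCollector's Configuration (setMaxRedirect(0) in the constructor). For the more involved route, a small hypothetical helper could flag URLs whose path repeats the same segment back to back, which is exactly the a/a/a/... pattern such a loop produces. This is only a sketch and is not used by the program below.

// Hypothetical loop check: a URL path that repeats a segment back to back
// (".../a/a/...") is assumed to come from a redirect cycle and is skipped.
class RedirectLoopDetector {
    static boolean looksLikeLoop(String url) {
        // drop the scheme and host, keep only the path
        String path = url.replaceFirst("^[a-zA-Z][a-zA-Z0-9+.-]*://[^/]*", "");
        String[] segments = path.split("/");
        for (int i = 1; i < segments.length; i++) {
            if (!segments[i].isEmpty() && segments[i].equals(segments[i - 1])) {
                return true;
            }
        }
        return false;
    }
}

With that pitfall out of the way, the Maven setup just pulls in WebCollector and jsoup: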
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>wyf</groupId>
    <artifactId>CloneSite</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/cn.edu.hfut.dmic.webcollector/WebCollector -->
        <dependency>
            <groupId>cn.edu.hfut.dmic.webcollector</groupId>
            <artifactId>WebCollector</artifactId>
            <version>2.71</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>utf-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
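The crawler itself is a single class extending BreadthCrawler: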
import cn.edu.hfut.dmic.webcollector.conf.Configuration;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
public class Main extends BreadthCrawler {
    static String seed = "http://www.xqbase.com/computer.htm";
    // only pages whose URL starts with this prefix are crawled as HTML
    static String prefix = "http://www.xqbase.com/computer";
    // local folder the cloned site is written to
    static Path targetFolder = Paths.get("haha").toAbsolutePath();
    int maxRedirect = 0; // 0 disables following redirects

    public Main(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        this.getConf().setMaxRedirect(maxRedirect);
        this.addSeed(seed);
    }

    // control characters and <>:"|?* are not allowed in (Windows) file names
    boolean isInvalidPathChar(char var0) {
        return var0 < ' ' || "<>:\"|?*".indexOf(var0) != -1;
    }

    boolean isInvalidPath(String path) {
        for (int i = 0; i < path.length(); i++) {
            if (isInvalidPathChar(path.charAt(i))) return true;
        }
        return false;
    }
    /**
     * Convert a URL into a local path, used for saving page content to disk.
     *
     * @param url  absolute URL
     * @param type file type, used to decide the extension of the saved file
     */
    Path url2path(String url, String type) {
        int beg = url.indexOf(":") + 3; // skip the "scheme://" part
        String path = url.substring(beg);
        // if the name contains illegal characters, fall back to its hash code
        if (isInvalidPath(path)) {
            path = path.hashCode() + "";
        }
        if (type != null && !path.endsWith("." + type)) {
            path += '.' + type;
        }
        return targetFolder.resolve(path);
    }

    /**
     * htmlUrl is the URL of the current HTML page, resourceUrl the URL of a resource
     * it references; returns the relative path between the two local files.
     * resourceType forces the extension of the saved resource file.
     */
    String path2relative(String htmlUrl, String resourceUrl, String resourceType) {
        return url2path(htmlUrl, "html").getParent()
                .relativize(url2path(resourceUrl, resourceType))
                .toString().replace('\\', '/');
    }
    /**
     * Create a directory recursively, used before writing a file.
     */
    void mkdir(Path p) {
        p = p.toAbsolutePath();
        if (Files.exists(p)) return;
        if (Files.notExists(p.getParent())) mkdir(p.getParent());
        try {
            Files.createDirectory(p);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Save a text file.
     */
    void writeFile(Path path, String content, Charset encoding) {
        mkdir(path.getParent());
        try (BufferedWriter cout = Files.newBufferedWriter(path, encoding)) {
            cout.write(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Save a binary file.
     */
    void writeFile(Path path, byte[] data) {
        mkdir(path.getParent());
        try (OutputStream cout = Files.newOutputStream(path)) {
            cout.write(data);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // rewrite the src attributes of script/svg/img tags and queue the resources as binary downloads
    void src(Page page, CrawlDatums crawlDatums, Document doc) {
        String[] tags = new String[]{"script", "svg", "img"};
        for (String tag : tags) {
            for (Element e : doc.select(tag)) {
                if (!e.hasAttr("src")) continue;
                String s = e.absUrl("src");
                if (s.trim().length() == 0) continue;
                e.attr("src", path2relative(page.url(), s, null));
                crawlDatums.add(new CrawlDatum(s, "binary"));
            }
        }
    }

    // rewrite the href attributes of <link> tags (stylesheets, icons, ...) and queue them as binary downloads
    void hrefOfResource(Page page, CrawlDatums crawlDatums, Document doc) {
        String[] tags = new String[]{"link"};
        for (String tag : tags) {
            for (Element e : doc.select(tag)) {
                if (!e.hasAttr("href")) continue;
                String s = e.absUrl("href");
                if (s.trim().length() == 0) continue;
                e.attr("href", path2relative(page.url(), s, null));
                crawlDatums.add(new CrawlDatum(s, "binary"));
            }
        }
    }

    // rewrite <a href> links to relative .html paths; only URLs under prefix are crawled further
    void hrefOfHtml(Page page, CrawlDatums crawlDatums, Document doc) {
        for (Element e : doc.select("a")) {
            if (e.hasAttr("href")) {
                String s = e.absUrl("href");
                if (s.trim().length() == 0) continue;
                e.attr("href", path2relative(page.url(), s, "html"));
                if (s.startsWith(prefix)) {
                    crawlDatums.add(s);
                }
            }
        }
    }
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        if (page.matchType("binary")) {
            // resource files are written to disk as-is
            writeFile(url2path(page.url(), null), page.content());
        } else {
            // HTML pages get their links rewritten before being saved
            Document doc = page.doc();
            src(page, crawlDatums, doc);
            hrefOfResource(page, crawlDatums, doc);
            hrefOfHtml(page, crawlDatums, doc);
            writeFile(url2path(page.url(), "html"), doc.html(), doc.charset());
        }
    }

    public static void main(String[] args) throws Exception {
        // autoParse controls whether the engine extracts links itself; here we do it manually
        Main blog = new Main("webcollector", false);
        Configuration conf = blog.getConf();
        conf.setConnectTimeout(3000);
        blog.start(Integer.MAX_VALUE);
    }
}
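After a run, the cloned pages and resources land under the haha directory (targetFolder), mirroring the URL structure of the site; the seed page, for example, is saved as haha/www.xqbase.com/computer.htm.html, since url2path appends an .html suffix when it is missing, and can be opened locally with all links resolving to the copied files.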