1. Maven dependencies
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.zhaowu</groupId>
    <artifactId>pachong01</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.quartz-scheduler/quartz -->
        <dependency>
            <groupId>org.quartz-scheduler</groupId>
            <artifactId>quartz</artifactId>
            <version>2.3.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/cn.edu.hfut.dmic.webcollector/WebCollector -->
        <dependency>
            <groupId>cn.edu.hfut.dmic.webcollector</groupId>
            <artifactId>WebCollector</artifactId>
            <version>2.71</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.17</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.29</version>
        </dependency>
    </dependencies>
</project>
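Note: of the dependencies above, only jsoup and htmlunit are actually referenced by the two classes in this post; httpclient, commons-io, quartz, WebCollector, and poi appear unused here and can be trimmed if you only want this example.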
2. Creating the project
Create two Java files.
Part 1, HtmlUnitforBD.java: extracts the URL links from Baidu search results;
Part 2, transURLtoINFO.java: extracts the content behind each extracted link.
3. Inspecting the page
Looking at the page source:
3.1 The Baidu search box has id=kw.
3.2 The "百度一下" (search) button has id=su.
3.3 In the source of the results page for the query "习大大", almost every result contains a link carrying a data-click attribute, and those are what we extract (the code below selects a[data-click]). A few results use a "mu" attribute instead; since such elements are rare, I did not handle them, but you can adapt the code if you are interested.
3.4 Comparing the source of the other result pages reveals the pattern for addressing every page: pn=1 is page 2, pn=2 is page 3, and so on, while the rest of the URL stays the same. Substituting the number therefore jumps straight to the corresponding page, as the sketch below shows.
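As a minimal jsoup sketch of observations 3.3 and 3.4 (class and method names here are illustrative, not from the original code):

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

public class BaiduPageSketch {
    // 3.4: swap the pn value in a captured page-2 URL to address page (index + 1).
    static String pageUrl(String template, int index) {
        return template.replace("&pn=1", "&pn=" + index);
    }

    // 3.3: pull "href <tab> text" pairs out of result-page HTML via the data-click selector.
    static List<String> resultLinks(String html) {
        List<String> links = new ArrayList<String>();
        for (Element a : Jsoup.parse(html).select("a[data-click]")) {
            links.add(a.attr("href") + "\t" + a.text());
        }
        return links;
    }
}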
4. Now, the code!
(Part 1 contains two commented-out try/catch blocks; uncommenting them lets you inspect the text content fetched from each page. Occasionally the program hits an error such as a 504 while fetching a page; this is rare, and if it happens you can wait a moment and let the program continue after it reports the failure.)
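The post itself does not include retry code; the following is only a sketch of that "wait and continue" idea, where fetchOnce is a hypothetical fetch supplied by the caller:

import java.util.concurrent.Callable;

// Sketch only: retry a fetch a few times with a growing pause, roughly
// matching the advice above. fetchOnce is a hypothetical caller-supplied fetch.
public class RetrySketch {
    static String fetchWithRetry(Callable<String> fetchOnce, int attempts) throws Exception {
        Exception last = null;
        for (int i = 0; i < attempts; i++) {
            try {
                return fetchOnce.call();
            } catch (Exception e) { // e.g. a 504 surfaced as FailingHttpStatusCodeException
                last = e;
                Thread.sleep(2000L * (i + 1)); // wait a little longer each time
            }
        }
        throw last; // give up after the final attempt
    }
}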
Part 1 (collecting the links):
package bdsearch;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HtmlUnitforBD {
    private static int N = 3; // number of result pages to harvest
    private static String keyW = "习大大"; // search keyword
    private static HtmlPage firstBaiduPage; // page returned by the first search
    private static String format = ""; // URL template for the later pages
    private static ArrayList<String> eachurl = new ArrayList<String>(); // collected links

    public static void main(String[] args) throws Exception {
        mainFunction(N, keyW);
    }

    public static void mainFunction(final int n, final String keyWord)
            throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        Thread thread = new Thread(new Runnable() {
            @Override
            public void run() {
                int x = n;
                System.out.println("Extracting the first " + x + " pages of Baidu results for \"" + keyWord + "\"");

                // Page 1: drive the search form with HtmlUnit, then parse with jsoup.
                Elements firstPageURL = null;
                try {
                    firstPageURL = getFirstPage(keyWord);
                } catch (FailingHttpStatusCodeException | IOException e) {
                    e.printStackTrace();
                }
                if (firstPageURL == null) { // first fetch failed; nothing to do
                    return;
                }
                for (Element newlink : firstPageURL) {
                    String linkHref = newlink.attr("href");
                    String linkText = newlink.text();
                    if (linkHref.length() > 14 && linkText.length() > 2) { // skip junk links
                        System.out.println(linkHref + "\n\t\tSummary: " + linkText);
                        eachurl.add(linkHref);
                        // Optional: uncomment to dump the text content of each hit.
                        // try {
                        //     transURLtoINFO.trans(linkHref, "");
                        // } catch (Exception e) {
                        //     e.printStackTrace();
                        // }
                    }
                }

                nextHref(firstBaiduPage); // capture the URL template for page 2

                // Pages 2..x: substitute the pn value in the template.
                for (int i = 1; i < x; i++) {
                    System.out.println("\n************ Baidu results for \"" + keyWord + "\", page " + (i + 1) + " ************");
                    String tempURL = format.replaceAll("&pn=1", "&pn=" + i);
                    System.out.println("Page address: " + tempURL);
                    String htmls = getPageSource(tempURL, "utf-8");
                    org.jsoup.nodes.Document doc = Jsoup.parse(htmls);
                    Elements links = doc.select("a[data-click]");
                    for (Element newlink : links) {
                        String linkHref = newlink.attr("href");
                        String linkText = newlink.text();
                        if (linkHref.length() > 14 && linkText.length() > 2) {
                            System.out.println(linkHref + "\n\t\tSummary: " + linkText);
                            eachurl.add(linkHref);
                            // Optional: uncomment to dump the text content of each hit.
                            // try {
                            //     transURLtoINFO.trans(linkHref, "");
                            // } catch (Exception e) {
                            //     e.printStackTrace();
                            // }
                        }
                    }
                }
                System.out.println("\n\n\nAll collected links:");
                for (String xx : eachurl) {
                    System.out.println(xx);
                }
            }
        });
        thread.start();
    }

    /** Open baidu.com, fill the kw box, click the su button, and return the a[data-click] links of page 1. */
    public static Elements getFirstPage(String w)
            throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        String word = w;
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        webClient.getOptions().setJavaScriptEnabled(false); // no JS/CSS: faster, and enough for this page
        webClient.getOptions().setCssEnabled(false);
        HtmlPage page = (HtmlPage) webClient.getPage("http://www.baidu.com/"); // Baidu home page
        HtmlInput input = (HtmlInput) page.getHtmlElementById("kw"); // search box
        input.setValueAttribute(word);
        HtmlInput btn = (HtmlInput) page.getHtmlElementById("su"); // search button
        firstBaiduPage = btn.click();
        String WebString = firstBaiduPage.asXml();
        org.jsoup.nodes.Document doc = Jsoup.parse(WebString);
        System.out.println("************ Baidu results for \"" + word + "\", page 1 ************");
        Elements links = doc.select("a[data-click]");
        return links;
    }

    /** Read the pager of page 1 and keep its first link as the URL template ("format"). */
    public static void nextHref(HtmlPage p) {
        String morelinks = p.getElementById("page").asXml(); // the pager block
        org.jsoup.nodes.Document doc = Jsoup.parse(morelinks);
        Elements links = doc.select("a[href]");
        boolean getFormat = true;
        for (Element newlink : links) {
            String linkHref = newlink.attr("href");
            if (getFormat) {
                format = "http://www.baidu.com" + linkHref; // complete the template
                getFormat = false;
            }
        }
    }

    /** Plain URL fetch: read the page source with the given encoding. */
    public static String getPageSource(String pageUrl, String encoding) {
        StringBuffer sb = new StringBuffer();
        try {
            URL url = new URL(pageUrl);
            BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), encoding));
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
                sb.append("\n");
            }
            in.close();
        } catch (Exception ex) {
            System.err.println(ex);
        }
        return sb.toString();
    }
}
Part 2 (extracting potentially useful text):
package bdsearch;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.ConnectException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class transURLtoINFO {

    // Regexes for stripping scripts, styles, remaining HTML tags, and whitespace.
    private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>";
    private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>";
    private static final String regEx_html = "<[^>]+>";
    private static final String regEx_space = "\\s*|\t|\r|\n";

    public static void main(String[] args)
            throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        String temp = ""; // start empty so the accumulator does not begin with "null"
        trans("http://www.baidu.com/", temp);
        System.out.println("over");
    }

    /** Fetch the page at url and append every text chunk of 50+ characters to info. */
    public static String trans(String url, String info)
            throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        ArrayList<String> hrefList = new ArrayList<String>();
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        webClient.getOptions().setJavaScriptEnabled(false);
        webClient.getOptions().setCssEnabled(false);
        try {
            HtmlPage page = null;
            try {
                page = (HtmlPage) webClient.getPage(url);
            } catch (ConnectException e) {
                System.err.println("Connection failed: " + url);
            }
            if (page == null) { // fetch failed above; avoid a NullPointerException
                return info;
            }
            // Read the rendered page text line by line.
            InputStream temp = new ByteArrayInputStream(page.asText().getBytes());
            InputStreamReader isr = new InputStreamReader(temp);
            BufferedReader br = new BufferedReader(isr);
            String str;
            while ((str = br.readLine()) != null) {
                hrefList.add(str);
            }
            System.out.println("Possibly relevant text found at " + url + ":");
            for (int i = 0; i < hrefList.size(); i++) {
                String string = getTextFromHtml(hrefList.get(i));
                if (string.length() >= 50) { // keep only substantial chunks
                    info += "\n" + string;
                    System.out.println(string);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return info;
    }

    /** Strip whitespace, scripts, styles, and all remaining HTML tags. */
    public static String delHTMLTag(String htmlStr) {
        Pattern p_space = Pattern.compile(regEx_space, Pattern.CASE_INSENSITIVE);
        Matcher m_space = p_space.matcher(htmlStr);
        htmlStr = m_space.replaceAll("");

        Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
        Matcher m_script = p_script.matcher(htmlStr);
        htmlStr = m_script.replaceAll("");

        Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
        Matcher m_style = p_style.matcher(htmlStr);
        htmlStr = m_style.replaceAll("");

        Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
        Matcher m_html = p_html.matcher(htmlStr);
        htmlStr = m_html.replaceAll("");

        return htmlStr.trim();
    }

    public static String getTextFromHtml(String htmlStr) {
        htmlStr = delHTMLTag(htmlStr);
        htmlStr = htmlStr.replaceAll(" ", ""); // drop remaining spaces (redundant after regEx_space; kept from the original)
        return htmlStr;
    }
}
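To wire the two parts together, one option (a sketch, not code from the original post; it assumes the loop runs where eachurl is visible, e.g. inside HtmlUnitforBD) is to feed every collected link through trans:

// Sketch: run every URL collected by Part 1 through Part 2's extractor.
for (String link : eachurl) {
    try {
        String text = transURLtoINFO.trans(link, "");
        // text now holds all 50+ character chunks found on that page
    } catch (Exception e) {
        System.err.println("Skipping " + link + ": " + e);
    }
}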
Run results:
The program also prints every collected link, so this approach can be used purely for gathering links.
If you uncomment the optional blocks, the fetched text is printed as well.
Java source for harvesting Baidu search results (mostly reposted, with small modifications of my own; uses htmlunit and Jsoup) (reposted from: https://blog.csdn.net/zhaohang_1/article/details/44731039)
Original post: https://www.cnblogs.com/sutao/p/9009563.html