// Tags: html soc mil interface list esc java 源代码 down
package com.tl.spider.parser.impl;

import com.tl.spider.download.WebPageDownLoadUtil;
import com.tl.spider.parser.interfaces.ParseFieldsInterface;
import com.tl.spider.pojos.ParserResultEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Parser implementation that extracts news entries from an HTML page using Jsoup.
 *
 * <p>Despite the "Xpath" in the class name, the extraction is done with Jsoup CSS
 * selectors, not XPath expressions.
 *
 * <p>Created 2019/5/19 16:09.
 *
 * @author Administrator
 * @version 1.0
 */
public class ParseFields4Xpath implements ParseFieldsInterface {

    /** CSS selector of the {@code <ul>} element that holds the news items. */
    private static final String NEWS_LIST_SELECTOR = "ul.tj3_1";

    /**
     * Parses the given HTML and builds one result entity per {@code <li>} found
     * inside the first {@code ul.tj3_1} element.
     *
     * <p>For each list item the text of its {@code <a>} descendants becomes the title
     * and the text of its {@code <font>} descendants becomes the post date. Every
     * entity produced by one call shares the same insert-date string, taken from
     * {@link Date#toString()} at the time of the call.
     *
     * @param htmlContent the HTML document as a string; may be {@code null}
     * @return a mutable list of parsed entities; empty (never {@code null}) when
     *         {@code htmlContent} is {@code null} or the page contains no
     *         {@code ul.tj3_1} element
     */
    @Override
    public List<ParserResultEntity> parseHtml(String htmlContent) {
        List<ParserResultEntity> results = new ArrayList<>();
        if (htmlContent == null) {
            return results;
        }

        // Build the DOM from the raw HTML string.
        Document doc = Jsoup.parse(htmlContent);

        // first() yields null when nothing matches; without this guard a page that
        // lacks the expected list would fail with a NullPointerException.
        Element newsList = doc.select(NEWS_LIST_SELECTOR).first();
        if (newsList == null) {
            return results;
        }

        Elements items = newsList.select("li");
        // Computed once so that all entities of this run carry the same timestamp.
        String insertDate = new Date().toString();

        for (Element item : items) {
            ParserResultEntity entity = new ParserResultEntity();
            entity.setTitle(item.select("a").text());
            entity.setPostDate(item.select("font").text());
            entity.setInsertDate(insertDate);
            results.add(entity);
        }

        // Alternative noted by the original author: a full CSS path such as
        // "body > div.main > div.main_l > div.rdwz > ul > li:nth-child(1) > a"
        // (and "... > font") selects the title and date of a single item.
        return results;
    }

    /**
     * Manual smoke test: downloads the news index page, prints the length of the
     * downloaded content, parses it and prints every extracted entity.
     *
     * @param args unused
     * @throws Exception propagated from the download helper
     */
    public static void main(String[] args) throws Exception {
        String url = "http://news.youth.cn/gn/";
        // The encoding can be found in the page source's <meta charset> declaration.
        String charSet = "gb2312";

        String content = WebPageDownLoadUtil.getHtmlSourceBySocket(url, charSet);
        System.out.println(content.length());

        ParseFields4Xpath parseFields4Xpath = new ParseFields4Xpath();
        List<ParserResultEntity> results = parseFields4Xpath.parseHtml(content);
        for (ParserResultEntity message : results) {
            System.out.println(message.toString());
        }
    }
}
// Tags: html soc mil interface list esc java 源代码 down
// Original article: https://www.cnblogs.com/wylwyl/p/10890050.html