标签:roo nts span text document throws etc ++ https
jsoup爬取文章内容
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // TODO Auto-generated method stub //response.getWriter().append("Served at: ").append(request.getContextPath()); String agent1 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"; int pageNum=1; int pageSize=899; //for(pageNum=1;pageNum<101;pageNum++) for(pageNum=1;pageNum<2;pageNum++) { try { int page1= 277; Map<Integer,String> map1 = ManageMySQL.getNewsLinkInTable(page1,pageSize,"data_szyjglj"); for(Integer key : map1.keySet()) { System.out.println(key+" "+map1.get(key)); String news_link = map1.get(key); String context1=""; String source1=""; //String context1 = getContentByURL(news_link).replace(" ", ""); Document documentRoot = Jsoup.connect(news_link).userAgent(agent1).get(); Elements elements1 = documentRoot.select("div.source span"); if(elements1.size()==2) { Element span_ele = elements1.get(0); source1 = span_ele.text(); } Elements elements2 = documentRoot.select("div.view_box"); if(elements2.size()==1) { Element div_ele = elements2.get(0); context1 = div_ele.text(); } ManageMySQL.updateContextAndPublishDate(key, context1.replace("‘", "").replace("\"", ""),source1,"data_szyjglj"); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
标签:roo nts span text document throws etc ++ https
原文地址:https://www.cnblogs.com/herd/p/11722013.html