码迷,mamicode.com
首页 > 其他好文 > 详细

selenium 爬取空间说说

时间:2018-09-23 22:36:38      阅读:218      评论:0      收藏:0      [点我收藏+]

标签:src   set   lis   selector   rup   iter   except   style   interrupt   

  1 package cn.hb.util;
  2 
  3 import java.io.File;
  4 import java.io.FileWriter;
  5 import java.io.IOException;
  6 import java.util.Set;
  7 import java.util.concurrent.TimeUnit;
  8 import org.openqa.selenium.By;
  9 import org.openqa.selenium.Cookie;
 10 import org.openqa.selenium.JavascriptExecutor;
 11 import org.openqa.selenium.Keys;
 12 import org.openqa.selenium.WebDriver;
 13 import org.openqa.selenium.WebElement;
 14 import org.openqa.selenium.firefox.FirefoxDriver;
 15 import org.openqa.selenium.firefox.FirefoxOptions;
 16 import org.openqa.selenium.interactions.Actions;
 17 
 18 /**
 19  * 爬取说说写入到txt中,爬取100条
 20  * 
 21  * @author tele
 22  *
 23  */
 24 public class QZTwitterCrawler {
 25     static String url = "https://user.qzone.qq.com/1350560858";
 26     static int maxSize = 100;
 27     static int pageSize = 20;
 28     static String userName="qq"; 
 29     static String pwd = "密码";
 30     public static void main(String[] args) throws InterruptedException, IOException {
 31         login();
 32     }
 33 
 34     /**
 35      * 登录
 36      * 
 37      * @throws InterruptedException
 38      * @throws IOException
 39      */
 40     public static void login() throws InterruptedException, IOException {
 41         System.setProperty("webdriver.gecko.driver", "D:/browserdriver/geckodriver.exe");
 42 
 43         FirefoxOptions options = new FirefoxOptions();
 44         options.setBinary("F:/ff/firefox.exe");
 45 
 46         WebDriver driver = new FirefoxDriver(options);
 47         driver.manage().window().maximize();
 48         // 超时
 49         try {
 50             driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
 51             driver.manage().timeouts().setScriptTimeout(3, TimeUnit.SECONDS);
 52             driver.get(url);
 53         } catch (Exception e) {
 54             System.out.println("所需元素已出现,停止加载页面");
 55         } finally {
 56             // 切换到登录login
 57             driver.switchTo().frame("login_frame");
 58 
 59             WebElement switcher_plogin = driver.findElement(By.id("switcher_plogin"));
 60             System.out.println(switcher_plogin.getText());
 61             if (switcher_plogin.isDisplayed()) {
 62                 switcher_plogin.click();
 63             }
 64             // 用户名
 65             driver.findElement(By.id("u")).clear();
 66             driver.findElement(By.id("u")).sendKeys(userName);
 67 
 68             // 密码
 69             driver.findElement(By.id("p")).clear();
 70             driver.findElement(By.id("p")).sendKeys(pwd);
 71 
 72             // 登录
 73             try {
 74                 driver.findElement(By.id("login_button")).click();
 75                 Thread.sleep(3000);
 76             } catch (Exception e) {
 77                 e.printStackTrace();
 78             } finally {
 79                 if ("https://i.qq.com/".equals(driver.getCurrentUrl())) {
 80                     System.out.println("登录失败!5秒后再次尝试登录");
 81                     Thread.sleep(5000);
 82                     driver.findElement(By.id("login_button")).click();
 83                 }
 84             }
 85 
 86             // 退出frame
 87             driver.switchTo().defaultContent();
 88 
 89             System.out.println(driver.getCurrentUrl());
 90 
 91             JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;
 92             // 如果有亲密度提示
 93             
 94               try { WebElement fs_guide = driver.findElement(By.xpath(
 95               "//div[@id=‘friendship_promote_layer‘]/table[@class=‘tbl-fs-guide‘]//a"
 96              )); if(fs_guide != null && fs_guide.isDisplayed()) {
 97               fs_guide.click(); } } catch (Exception e) { e.printStackTrace();
 98               }finally {
 99              
100               }
101              
102 
103             // 点击说说
104             driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_311>a")).click();
105 
106             Thread.sleep(2000);
107 
108             // 切换到frame
109             driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
110 
111             Thread.sleep(5000);
112 
113             // 拼接cookie
114         /*    StringBuilder builder = new StringBuilder();
115             Set<Cookie> cookieSet = driver.manage().getCookies();
116             cookieSet.forEach(c -> builder.append(c.getName()).append("=").append(c.getValue()).append("; "));
117             cookies = builder.toString();*/
118 
119             // 定位元素
120             saveTwitter(driver);
121 
122             System.out.println("内容提取完毕,退出浏览器");
123             driver.quit();
124 
125         }
126     }
127 
128     /**
129      * 序列化
130      * @param driver
131      * @return
132      * @throws InterruptedException
133      * @throws IOException
134      */
135     public static void saveTwitter(WebDriver driver) throws InterruptedException, IOException {
136         File file = new File("f:/qz/twitter.txt");
137 
138         // 文件夹检测
139         if (!file.getParentFile().exists()) {
140             file.mkdirs();
141         } else {
142             file.delete();
143         }
144 
145         
146         
147         FileWriter fileWriter = new FileWriter(file, true);
148 
149         String xpath;
150         // 模拟按键进行滚动
151         Actions actions = new Actions(driver);
152 
153         
154         //说说总量
155         String totalNumStr = driver.findElement(By.xpath("//div[@class=‘feed_num‘]/a")).getText();
156         int totalNum = Integer.parseInt(totalNumStr);
157         
158         // 计算页数
159         int totalPage = (int) Math.ceil((double)Math.min(maxSize, totalNum) / (double) pageSize);
160 
161         // 构造xpath
162         for (int i = 0; i < totalPage; i++) {
163 
164             for (int j = 0; j < pageSize; j++) {
165                 xpath = "//ol[@id=‘msgList‘]/li[" + (j + 1) + "]/div[3]/div[2]/pre[@class=‘content‘]";
166                 // 获取说说内容
167                 try {
168                     WebElement element = driver.findElement(By.xpath(xpath));
169                     String text = element.getText();
170                     System.out.println("本页第" + (j + 1) + "条   :" + text);
171                     fileWriter.write(text, 0, text.length());
172 
173                 } catch (Exception e) {
174                     e.printStackTrace();
175                 } finally {
176 
177                 }
178                 if (j % 2 == 0) {
179                     actions.sendKeys(Keys.ARROW_DOWN).perform();
180                 }
181             }
182             System.out.println("" + (i + 1) + "页说说爬取完毕");
183             // 分页
184             if ((i + 2) <= totalPage) {
185                 driver.findElement(By.xpath("//a[@id=‘pager_num_" + i + "_" + (i + 2) + "‘]")).click();
186                 // 等待页面加载
187                 Thread.sleep(3000);
188             }
189         }
190 
191         if (fileWriter != null) {
192             fileWriter.close();
193         }
194     }
195 
196 }

比爬取相册简单一些,唯一有点绕的是页码的构造。这个版本只支持获取文字内容,可以用来生成词云。

技术分享图片

 

selenium 爬取空间说说

标签:src   set   lis   selector   rup   iter   except   style   interrupt   

原文地址:https://www.cnblogs.com/tele-share/p/9693681.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!