码迷,mamicode.com
首页 > Web开发 > 详细

Selenium FF WebDriver 遍历所有链接(另类爬虫)

时间:2014-11-23 23:01:23      阅读:351      评论:0      收藏:0      [点我收藏+]

标签:style   blog   http   io   ar   color   os   使用   sp   

请看这个页面,我想要找到某个公告的内容,必须一个一个打开链接,尼玛好多啊。

于是,我机智地使用selenium打开每一个链接,然后把公告内容写入txt

那需要做以下步骤

1.依次打开一个公告

2.切换focus到新窗口,找到公告内容,写到txt

3.关闭该窗口

4.切换到主窗口

5.当前页面遍历完,点击下一页

6.重复步骤1

 

bubuko.com,布布扣

由于下一页是一个很好用的flag,就可以当做循环条件,因为最后一页没有下一页的element

接下来要找到相关的xpath

列表数目: count(//tr/td/a[starts-with(@href,'article_show.asp?ID=') and @title!='' ])
列表:      //tr/td/a[starts-with(@href,'article_show.asp?ID=') and @title!='' ]
下一页:   //div/a[text()='下一页']

用selenium WebDriver测试网页时,点击target="_blank"的链接会打开一个新页面,此时需要把driver切换到新窗口

这要使用

String currentWindow = driver.getWindowHandle();// get the handle of the current window
Set<String> handles = driver.getWindowHandles();// get the handles of all open windows

WebDriver window = driver.switchTo().window(it.next());// switch to the new window; `it` is presumably an iterator over `handles` (not shown here)

driver.switchTo().window(currentWindow);// go back to the original page

driver=driver.switchTo().window(driver.getWindowHandle()); // author's note: make the next page the current driver (switches to the handle the driver already reports)

bubuko.com,布布扣
// Remember which window we are in so it can be kept open and re-selected.
currentWindow = driver.getWindowHandle();
// Visit every open window and close all of them except the current one.
for (String handle : driver.getWindowHandles()) {
    if (!handle.equals(currentWindow)) {
        window = driver.switchTo().window(handle);
        window.close();
    }
}
// Hand focus back to the window we started from.
driver.switchTo().window(currentWindow);
View Code

具体代码

 

package com.packt.webdriver.chapter3;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;

/**
 * Walks a paginated announcement list with Selenium WebDriver and appends the
 * title and body text of every announcement to {@code report.txt}.
 *
 * <p>For each list page: every announcement link is clicked (it opens in a new
 * window), the new window's paragraphs are printed and appended to the report,
 * the new window is closed and focus returns to the list window. Then the
 * "next page" link is clicked, until no such link is present.
 */
public class TraversalAllLinks {
    /** First page of the announcement list. */
    private static final String START_URL =
            "http://www.lhgtj.gov.cn/article.asp?ClassID=86&page=1";

    /** Announcement links on a list page. Plain ASCII quotes are required for valid XPath. */
    private static final By ARTICLE_LINKS =
            By.xpath("//tr/td/a[starts-with(@href,'article_show.asp?ID=') and @title!='' ]");

    /** The "next page" link; absent on the last page. */
    private static final By NEXT_PAGE = By.xpath("//tr/td/a[@title='下一页']");

    /** Paragraphs holding the announcement text inside the article window. */
    private static final By ARTICLE_PARAGRAPHS = By.xpath("//tbody/tr/td/p");

    /** File the scraped text is appended to (relative to the working directory). */
    private static final String REPORT_FILE = "report.txt";

    /** Implicit-wait and page-load timeout, in seconds. */
    private static final long TIMEOUT_SECONDS = 60;

    /**
     * Entry point: opens the list, scrapes every page, then quits the browser.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        WebDriver driver = DriverFactory.getFirefoxDriver();
        try {
            driver.get(START_URL);
            driver.manage().window().maximize();
            // Timeouts are configured on the driver session, so setting them once is enough.
            driver.manage().timeouts().implicitlyWait(TIMEOUT_SECONDS, TimeUnit.SECONDS);
            driver.manage().timeouts().pageLoadTimeout(TIMEOUT_SECONDS, TimeUnit.SECONDS);

            while (true) {
                scrapeListPage(driver);

                // findElements returns an empty list instead of throwing, so the last
                // page (which has no "next page" link) is scraped and ends the loop
                // cleanly. Note it still waits out the implicit timeout on that page.
                List<WebElement> nextPage = driver.findElements(NEXT_PAGE);
                if (nextPage.isEmpty() || !nextPage.get(0).isDisplayed()) {
                    break;
                }
                nextPage.get(0).click();
            }
        } finally {
            // Always release the browser, even when scraping fails midway.
            driver.quit();
        }
    }

    /** Opens every announcement linked from the current list page and records its content. */
    private static void scrapeListPage(WebDriver driver) {
        String listWindow = driver.getWindowHandle();
        List<WebElement> links = driver.findElements(ARTICLE_LINKS);

        for (WebElement link : links) {
            String title = link.getText();
            System.out.println(title);
            appendToReport(title);

            link.click();
            for (String handle : driver.getWindowHandles()) {
                if (handle.equals(listWindow)) {
                    continue; // never close the list window itself
                }
                try {
                    scrapeAndCloseWindow(driver, handle);
                } finally {
                    // Return to the list window so the remaining links stay usable.
                    driver.switchTo().window(listWindow);
                }
            }
        }
    }

    /** Switches to the given window, records all announcement paragraphs, then closes it. */
    private static void scrapeAndCloseWindow(WebDriver driver, String handle) {
        WebDriver window = driver.switchTo().window(handle);
        try {
            window.manage().window().maximize();
            for (WebElement paragraph : window.findElements(ARTICLE_PARAGRAPHS)) {
                String text = paragraph.getText();
                System.out.println(text);
                appendToReport(text + "\n");
            }
        } finally {
            window.close();
        }
    }

    /** Appends to the report; a write failure is reported but does not stop the crawl. */
    private static void appendToReport(String message) {
        try {
            writeToTXT(message);
        } catch (IOException e) {
            System.err.println("could not write to " + REPORT_FILE);
            e.printStackTrace();
        }
    }

    /**
     * Appends {@code message} to {@code report.txt}, creating the file if needed.
     *
     * <p>NOTE(review): {@link FileWriter} uses the platform default charset; confirm
     * that is acceptable for the Chinese text being written.
     *
     * @param message text to append verbatim (no line terminator is added)
     * @throws IOException if the file cannot be opened, written or closed
     */
    public static void writeToTXT(String message) throws IOException {
        // Open outside the try so a failed open cannot lead to close() on null.
        // "true" = append mode, so earlier entries are kept.
        BufferedWriter writer = new BufferedWriter(new FileWriter(REPORT_FILE, true));
        try {
            writer.write(message);
            writer.flush();
        } finally {
            writer.close();
        }
    }

}

DriverFactory

bubuko.com,布布扣
/**
 * Builds a Firefox WebDriver whose profile has the Firebug 2.0.4 add-on
 * installed from {@code d:\firebug-2.0.4-fx.xpi}.
 *
 * <p>Before starting, it asks {@code WindowsUtils} to kill any process named
 * {@code firefox.exe}; a failure there is only reported on stdout. If the
 * add-on cannot be installed, the stack trace is printed and the driver is
 * still created with the profile as it stands.
 *
 * @return a new Firefox driver using the prepared profile
 */
public static WebDriver getFirefoxDriver() {
    // Best effort: get rid of a Firefox process left over from an earlier run.
    try {
        WindowsUtils.tryToKillByName("firefox.exe");
    } catch (Exception killFailure) {
        System.out.println("can not find firefox process");
    }

    FirefoxProfile firebugProfile = new FirefoxProfile();
    File firebugXpi = new File("d:\\firebug-2.0.4-fx.xpi");
    try {
        firebugProfile.addExtension(firebugXpi);
        // Preferences are only applied when the add-on was installed successfully.
        firebugProfile.setPreference("extensions.firebug.currentVersion", "2.0.4");
        firebugProfile.setPreference("extensions.firebug.allPagesActivation", "on");
    } catch (IOException installFailure) {
        installFailure.printStackTrace();
    }

    return new FirefoxDriver(firebugProfile);
}
View Code

 

Selenium FF WebDriver 遍历所有链接(另类爬虫)

标签:style   blog   http   io   ar   color   os   使用   sp   

原文地址:http://www.cnblogs.com/tobecrazy/p/4117506.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!