标签:
selenium采集页面元素
phantomjs主要是模拟登录
也没多少说的,上代码吧
import sys
import time

from selenium import webdriver
import selenium.webdriver.support.ui as ui

# Sign-in page; ReturnUrl sends the browser back to the cnblogs home page.
_LOGIN_URL = "http://passport.cnblogs.com/user/signin?ReturnUrl=http%3A%2F%2Fwww.cnblogs.com%2F"


def _wait_until_displayed(wait, element_id):
    """Block until the element with the given id is displayed (or `wait` times out)."""
    wait.until(lambda dr: dr.find_element_by_id(element_id).is_displayed())


def _console_print(text):
    """Print `text`, replacing characters the console encoding cannot represent.

    Printing the raw title text raised an encoding error on the author's
    console; round-tripping through the console encoding with "replace"
    avoids the crash without hard-coding a specific code page.
    Assumes `text` is a unicode string, as returned by WebElement.text.
    """
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
    print(text.encode(encoding, "replace").decode(encoding))


def crawl_cnblogs(blog_url, username, pwd):
    """Log in to cnblogs with PhantomJS and print the post titles and links of a blog.

    Args:
        blog_url: URL of the blog index page to scrape after logging in.
        username: cnblogs account name typed into the sign-in form.
        pwd: password typed into the sign-in form.

    Side effects:
        Prints each post title and each post URL to stdout and writes a
        screenshot of the blog page to ``screen.png`` in the working directory.

    Raises:
        selenium.common.exceptions.TimeoutException: if an expected element
            is not displayed within 10 seconds.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(_LOGIN_URL)
        wait = ui.WebDriverWait(driver, 10)

        # Fill in and submit the sign-in form once it is visible.
        _wait_until_displayed(wait, 'signin')
        driver.find_element_by_id("input1").send_keys(username)
        driver.find_element_by_id("input2").send_keys(pwd)
        driver.find_element_by_id("signin").click()

        # 'login_area' being displayed is used as the login-success signal.
        _wait_until_displayed(wait, 'login_area')

        driver.get(blog_url)
        _wait_until_displayed(wait, 'mainContent')
        time.sleep(3)  # extra settle time after the main content appears

        # Use the plural find_elements_* lookups: the singular find_element_*
        # returns a single WebElement, which cannot be iterated.
        for title in driver.find_elements_by_class_name("postTitle"):
            _console_print(title.text)

        for link in driver.find_elements_by_class_name("postTitle2"):
            print(link.get_attribute("href"))

        driver.save_screenshot('screen.png')
    finally:
        # Always shut the PhantomJS process down, even when a wait times out.
        driver.quit()


if __name__ == '__main__':
    crawl_cnblogs("http://www.cnblogs.com/xiaoyy3/", "xiaoyaoyou3", "------password---------")
运行结果
编码错误,需要改成 print article.text.encode('gb18030')
运行结果为
标签:
原文地址:http://www.cnblogs.com/xiaoyy3/p/5980516.html