Python 爬虫笔记（不定时更新）

时间：2015-07-03 09:08:17 阅读：247 评论：0 收藏：0 [点我收藏+]

标签：

参考：虫师的笔记 http://www.cnblogs.com/fnng/p/3576154.html

#自动访问某个网址

from selenium import webdriver
import time
# Repeatedly load one page in a real browser and report how many loads ran.
M = 100000  # total number of page loads to perform
URL = 'http://www.yyxxww.com/html/2015/edu_0318/3386.html'
browser = webdriver.Firefox()  # browser name; use whichever is installed on this machine
i = 0  # stays 0 when M is 0, so the final report is still correct
try:
    for i in range(1, M + 1):
        browser.get(URL)
        time.sleep(1)  # one-second pause between consecutive loads
finally:
    # Close the browser even when a page load raises, so no window is leaked.
    browser.quit()
# Single formatted string: prints the same text under Python 2 and Python 3.
print('本次python总共打开了 %d 次' % i)


#提取一级标题

import urllib2
from sgmllib import SGMLParser
# Page whose <h4> headings are extracted below.
URL = 'http://www.yyxxww.com/html/2015/edu_0318/3386.html'
class ListName(SGMLParser):
    """Collect the text content of every ``<h4>`` element fed to the parser.

    ``SGMLParser`` dispatches ``<h4>`` / ``</h4>`` to ``start_h4`` / ``end_h4``
    and passes character data to ``handle_data``.

    Attributes:
        is_h4: True while the parser is between ``<h4>`` and ``</h4>``.
        name: One string per completed ``<h4>`` element, in document order.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.is_h4 = False
        self.name = []
        # Text chunks of the heading currently being read.
        self._parts = []

    def start_h4(self, attrs):
        """Begin a new heading and discard any unfinished one."""
        self.is_h4 = True
        self._parts = []

    def end_h4(self):
        """Finish the current heading and store its full text in ``name``."""
        # A heading may arrive as several data chunks (for example around
        # entities or inline tags); join them so each heading is one entry.
        if self.is_h4 and self._parts:
            self.name.append("".join(self._parts))
        self.is_h4 = False
        self._parts = []

    def handle_data(self, text):
        """Buffer *text* when it lies inside an ``<h4>``; ignore it otherwise."""
        if self.is_h4:
            self._parts.append(text)
 
# Download the page, parse it, and print every collected <h4> heading.
response = urllib2.urlopen(URL)
try:
    content = response.read()
finally:
    # Release the HTTP connection even if the read fails.
    response.close()
listname = ListName()
listname.feed(content)
listname.close()  # force the parser to process any data it is still buffering
for item in listname.name:
    # The page is assumed to be GBK-encoded; re-encode each heading as UTF-8.
    print(item.decode('gbk').encode('utf8'))


#访问百度，并填写表单，中文暂时不好解决，英文没问题

# -*- coding: utf-8 -*-
# NOTE: Python only honours the encoding declaration above when it sits on the
# first or second line of a file, so this snippet must be saved as its own file.
import sys
import time

from selenium import webdriver

# Python 2-only workaround: switch the implicit str/unicode codec to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

browser = webdriver.Firefox()
try:
    browser.get("http://www.baidu.com")
    # send_keys() returns None, so nothing can be chained onto its result.
    # Pass the text as a unicode string instead of re-encoding afterwards.
    browser.find_element_by_id("kw").send_keys(u"你好")
    browser.find_element_by_id("su").click()
    time.sleep(30)  # wait 30 seconds before closing (the original note said 3 - TODO confirm)
finally:
    # Close the browser even if one of the steps above raises.
    browser.quit()

Python 爬虫笔记（不定时更新）

标签：

原文地址：http://www.cnblogs.com/hdu-2010/p/4617641.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行