码迷,mamicode.com
首页 > 编程语言 > 详细

python爬虫笔记

时间:2015-06-02 12:46:20      阅读:126      评论:0      收藏:0      [点我收藏+]

标签:

 1 import urllib2
 2 response = urllib2.urlopen("http://www.baidu.com")
 3 html = response.read()
 4 
 5 #eg2
 6 import urllib2
 7 req = urllib2.Request("http://www.baidu.com")
 8 response = urllib2.urllib2(req)
 9 the_page = response.read()
10 
11 #eg3 POST传送数据
12 import urllib
13 import urllib2
14 
15 url = "http://www.msdn.com"
16 values={name:Xu,
17         location:YJ,
18         language:Python}
19 
20 data = urllib.urlencode(values)
21 req = urllib2.Request(url,data) #发送请求,同时传送data表单
22 response = urllib2.urlopen(req) #接收数据
23 the_page = response.read()
24 
25 #eg4 GET传送数据
26 #
27 
28 #eg5 加入User-Agent
29 import urllib
30 import urllib2
31 
32 url = "http://www.msdn.com"
33 user_agent = Mozilla/4.0(compatible;MSIE 5.5;Windows NT)
34 values={name:Xu,
35         location:YJ,
36         language:Python}
37 
38 headers = {User-Agent:user_agent}
39 data = urllib.urlencode(values)
40 req = urllib2.Request(url,data,headers) #发送请求,同时传送data表单和User-agent
41 response = urllib2.urlopen(req) #接收数据
42 the_page = response.read()
43 
44 #eg6捕获异常
45 try:
46     response = urllib2.urlopen(req) #接收数据
47 except urllib2.URLError,e:
48     print e.reason
49     print e.code    #404 or 500...
50 #way2
51 try:
52     response = urllib2.urlopen(req) #接收数据
53 except urllib2.HTTPError,e:
54     print e.code    #404 or 500...
55 except urllib2.URLError,e:
56     print e.reason
57 
58 #way3. we command to handle exception in this way
59 try:
60     response = urllib2.urlopen(req) #接收数据
61 except urllib2.URLError,e:
62     if hasattr(e,code):
63         print Error code:,e.code
64     elif hasattr(e,reason):
65         print Reason:,e.reason
66 
67 #eg7
68 from urllib2 import Request,urlopen,URLError,HTTPError
69 old_url = "http://www.baidu.com"
70 req = Request(old_url)
71 response = urlopen(req)
72 rel_url = response.geturl()
73 info = response.info()
74 
75 #eg8 cookie
76 import urllib2
77 import cookielib
78 cookie = cookielib.CookieJar()
79 opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
80 response = opener.open("http://www.baidu.com")
81 for item in cookie:
82     print item.name,item.
83 
84 #eg9 正则表达式
85 import re
86 pattern = re.compile(r"hello")
87 match1 = pattern.match("hello world")
88 if match1:
89     print match1.group()
90 else:
91     print "match失败"

 

python爬虫笔记

标签:

原文地址:http://www.cnblogs.com/premier/p/4545807.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!