python爬虫笔记

时间：2015-06-02 12:46:20 阅读：126 评论：0 收藏：0 [点我收藏+]

标签：

 1 import urllib2
 2 response = urllib2.urlopen("http://www.baidu.com")
 3 html = response.read()
 4 
 5 #eg2
 6 import urllib2
 7 req = urllib2.Request("http://www.baidu.com")
 8 response = urllib2.urlopen(req)
 9 the_page = response.read()
10 
11 #eg3 POST传送数据
12 import urllib
13 import urllib2
14 
15 url = "http://www.msdn.com"
16 values={'name':'Xu',
17         'location':'YJ',
18         'language':'Python'}
19 
20 data = urllib.urlencode(values)
21 req = urllib2.Request(url,data) #发送请求，同时传送data表单
22 response = urllib2.urlopen(req) #接收数据
23 the_page = response.read()
24 
25 #eg4 GET传送数据
26 #略
27 
28 #eg5 加入User-Agent
29 import urllib
30 import urllib2
31 
32 url = "http://www.msdn.com"
33 user_agent = 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)'
34 values={'name':'Xu',
35         'location':'YJ',
36         'language':'Python'}
37 
38 headers = {'User-Agent':user_agent}
39 data = urllib.urlencode(values)
40 req = urllib2.Request(url,data,headers) #发送请求，同时传送data表单和User-agent
41 response = urllib2.urlopen(req) #接收数据
42 the_page = response.read()
43 
44 #eg6捕获异常
45 try:
46     response = urllib2.urlopen(req) #接收数据
47 except urllib2.URLError,e:
48     print e.reason
49     print e.code    #404 or 500...（注意：只有HTTPError才有code属性，普通URLError只有reason）
50 #way2
51 try:
52     response = urllib2.urlopen(req) #接收数据
53 except urllib2.HTTPError,e:
54     print e.code    #404 or 500...
55 except urllib2.URLError,e:
56     print e.reason
57 
58 #way3. we recommend handling exceptions this way
59 try:
60     response = urllib2.urlopen(req) #接收数据
61 except urllib2.URLError,e:
62     if hasattr(e,'code'):
63         print 'Error code:',e.code
64     elif hasattr(e,'reason'):
65         print 'Reason:',e.reason
66 
67 #eg7
68 from urllib2 import Request,urlopen,URLError,HTTPError
69 old_url = "http://www.baidu.com"
70 req = Request(old_url)
71 response = urlopen(req)
72 rel_url = response.geturl()
73 info = response.info()
74 
75 #eg8 cookie
76 import urllib2
77 import cookielib
78 cookie = cookielib.CookieJar()
79 opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
80 response = opener.open("http://www.baidu.com")
81 for item in cookie:
82     print item.name,item.value
83 
84 #eg9 正则表达式
85 import re
86 pattern = re.compile(r"hello")
87 match1 = pattern.match("hello world")
88 if match1:
89     print match1.group()
90 else:
91     print "match失败"

python爬虫笔记

标签：

原文地址：http://www.cnblogs.com/premier/p/4545807.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行