Python 简单爬虫功能实现

时间：2015-07-30 15:08:25 阅读：159 评论：0 收藏：0 [点我收藏+]

当Google创始人用Python写下他们第一个简陋的爬虫，运行在同样简陋的服务器上的时候，
很少有人能够想象，在接下来的数十年间，他们是怎样地颠覆了互联网乃至于人类的世界。

有网络的地方就有爬虫，爬虫英文名称spider。它是用来抓取网站数据的程序。比如: 我们通过一段程序，定期去抓取类似百度糯米、大众点评上的数据，将这些信息存储到数据库里，然后加上展示页面，一个团购导航站就问世了。毫无疑问，爬虫是很多网站的初期数据来源。

一、第一个爬虫功能的实现

——查看博文目录第一篇文章的URL

首先需要引入urllib模块，使用find函数查找URL所在的位置，再经过字符串切片处理，就得到了需要的URL。

#!/usr/bin/env python
# Print the URL of the first article listed on the blog's index page.
# NOTE: Python 2 script -- urllib.urlopen lives at urllib.request.urlopen
# in Python 3.
import urllib

con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()

# Walk forward through the page: the '<a title=' marker, then the 'href='
# that follows it, then the '.html' that terminates the link.
title = con.find(r'<a title=')
href = con.find(r'href=', title)
html = con.find(r'.html', href)

if title != -1 and href != -1 and html != -1:
    # +6 skips 'href=' plus one more character (presumably the opening
    # quote); +5 keeps the '.html' suffix inside the slice.
    url = con[href + 6:html + 5]
    print(url)
else:
    # str.find returns -1 on a miss; slicing with those offsets would
    # silently yield a meaningless string.
    print('article link not found')

二、查看博文目录第一页所有文章的URL

A：

#!/usr/bin/env python
# Print the URL of every article on the first index page (at most 40).
# NOTE: Python 2 script -- urllib.urlopen lives at urllib.request.urlopen
# in Python 3.
import urllib

url = []  # collected article URLs, in page order
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()

# Locate the first link: '<a title=' ... 'href=' ... '.html'.
title = con.find(r'<a title=')
href = con.find(r'href=', title)
html = con.find(r'.html', href)

# str.find returns -1 once a marker can no longer be found, which ends the loop.
while title != -1 and href != -1 and html != -1 and len(url) < 40:
    # +6 skips 'href=' plus one more character (presumably the opening
    # quote); +5 keeps the '.html' suffix inside the slice.
    url.append(con[href + 6:html + 5])
    print(url[-1])
    # Resume the search just after the link that was just extracted.
    title = con.find(r'<a title=', html)
    href = con.find(r'href=', title)
    html = con.find(r'.html', href)

或者B：

#!/usr/bin/env python
# Print the URL of every article on the first index page (at most 50),
# without keeping the URLs in a list.
# NOTE: Python 2 script -- urllib.urlopen lives at urllib.request.urlopen
# in Python 3.
import urllib

i = 0
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()

# Locate the first link: '<a title=' ... 'href=' ... '.html'.
title = con.find(r'<a title=')
href = con.find(r'href=', title)
html = con.find(r'.html', href)

while title != -1 and href != -1 and html != -1 and i < 50:
    # Extract and print BEFORE advancing, so the first link is not skipped
    # and no slice is ever taken from the -1 offsets of a failed search.
    # +6 skips 'href=' plus one more character (presumably the opening
    # quote); +5 keeps the '.html' suffix inside the slice.
    url = con[href + 6:html + 5]
    print(url)
    # Resume the search just after the link that was just printed.
    title = con.find(r'<a title=', html)
    href = con.find(r'href=', title)
    html = con.find(r'.html', href)
    i = i + 1

三、下载博文目录第一页所有的文章

A：

#!/usr/bin/env python
# Collect up to 20 links from a Zhihu collection page, then download each
# linked page into the local 'zhihu/' directory.
# NOTE: Python 2 script -- urllib.urlopen lives at urllib.request.urlopen
# in Python 3.
import os
import time
import urllib

SITE = 'http://www.zhihu.com'

url = []  # absolute URLs of the links found on the collection page
con = urllib.urlopen('http://www.zhihu.com/collection/19668036').read()

# Locate the first link: '<a target="_blank' ... 'href=' ... '>'.
target = con.find(r'<a target="_blank')
base = con.find(r'href=', target)
end = con.find('>', base)

# Pass 1: gather the links. str.find returns -1 on a miss, which ends the loop.
while target != -1 and base != -1 and end != -1 and len(url) < 20:
    # +25 skips '<a target="_blank' (17 chars) plus 8 more characters
    # (presumably '" href="'); end - 1 drops the character before '>'
    # (presumably the closing quote).
    url.append(SITE + con[target + 25:end - 1])
    print(url[-1])
    # Resume the search just after the tag that was just handled.
    target = con.find(r'<a target="_blank', end)
    base = con.find(r'href=', target)
    end = con.find('>', base)

# Pass 2: download exactly the links that were collected.
if not os.path.isdir('zhihu'):
    os.makedirs('zhihu')  # open() below fails if the directory is missing
for link in url:
    content = urllib.urlopen(link).read()
    # Build a flat file name from the path part of the URL; a raw URL
    # contains '/' and ':' and cannot be used as a file name directly.
    name = link[len(SITE):].strip('/').replace('/', '_')
    with open('zhihu/' + name + '.html', 'wb') as f:
        f.write(content)
    print('downloading ' + link)
    time.sleep(15)  # pause between requests

或者B：

#!/usr/bin/env python
# Download up to 30 pages linked from a Zhihu collection page into the
# local 'zhihu/' directory, fetching each page as soon as its link is found.
# NOTE: Python 2 script -- urllib.urlopen lives at urllib.request.urlopen
# in Python 3.
import os
import time
import urllib

i = 0
con = urllib.urlopen('http://www.zhihu.com/collection/19668036').read()

# Locate the first link: '<a target="_blank' ... 'href=' ... '>'.
target = con.find(r'<a target="_blank')
base = con.find(r'href=', target)
end = con.find('>', base)

if not os.path.isdir('zhihu'):
    os.makedirs('zhihu')  # open() below fails if the directory is missing

# str.find returns -1 once a marker can no longer be found, which ends the loop.
while target != -1 and base != -1 and end != -1 and i < 30:
    # +25 skips '<a target="_blank' (17 chars) plus 8 more characters
    # (presumably '" href="'); end - 1 drops the character before '>'
    # (presumably the closing quote).
    url = 'http://www.zhihu.com' + con[target + 25:end - 1]
    # +16 skips 'href=' plus 11 more characters (presumably the opening
    # quote and a fixed path prefix such as '/question/' -- confirm against
    # the page markup). Any remaining '/' is replaced so the name is a
    # single path component.
    name = con[base + 16:end - 1].replace('/', '_')
    # Resume the search just after the tag that was just handled.
    target = con.find(r'<a target="_blank', end)
    base = con.find(r'href=', target)
    end = con.find('>', base)
    content = urllib.urlopen(url).read()
    with open('zhihu/' + name + '.html', 'wb') as f:
        f.write(content)
    print('downloading ' + name)
    time.sleep(5)  # pause between requests
    i = i + 1

四、下载所有文章

A：

# Collect article URLs from index pages 1-7 of the blog (at most 350),
# then download every collected article into the local 'tmp/' directory.
# NOTE: Python 2 script -- urllib.urlopen lives at urllib.request.urlopen
# in Python 3.
import os
import time
import urllib

url = []  # article URLs gathered across all index pages

for page in range(1, 8):
    con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_' + str(page) + '.html').read()
    # Locate the first link on this page: '<a title=' ... 'href=' ... '.html'.
    title = con.find(r'<a title=')
    href = con.find(r'href=', title)
    html = con.find(r'.html', href)
    # str.find returns -1 once a marker can no longer be found.
    while title != -1 and href != -1 and html != -1 and len(url) < 350:
        # +6 skips 'href=' plus one more character (presumably the opening
        # quote); +5 keeps the '.html' suffix inside the slice.
        url.append(con[href + 6:html + 5])
        # Running 1-based link number followed by the URL.
        print('%d %s' % (len(url), url[-1]))
        title = con.find(r'<a title=', html)
        href = con.find(r'href=', title)
        html = con.find(r'.html', href)
    print('find end!')
print('all find end')

if not os.path.isdir('tmp'):
    os.makedirs('tmp')  # open() below fails if the directory is missing

# Download what was actually collected rather than a fixed count, so no
# article is left out and no empty URL is ever requested.
for link in url:
    content = urllib.urlopen(link).read()
    # The last 26 characters of the URL serve as the file name
    # (presumably the 'blog_<id>.html' part -- confirm against real URLs).
    with open('tmp/' + link[-26:], 'wb') as f:
        f.write(content)
    time.sleep(5)  # pause between requests
print('Download over!')

B：

#!/usr/bin/env python
# Walk index pages 1-7 of the blog and download each article into
# '/tmp/sina/' as soon as its link is found (at most 350 articles).
# NOTE: Python 2 script -- urllib.urlopen lives at urllib.request.urlopen
# in Python 3.
import os
import time
import urllib

url = []  # article URLs in the order they were downloaded

if not os.path.isdir('/tmp/sina'):
    os.makedirs('/tmp/sina')  # open() below fails if the directory is missing

for page in range(1, 8):
    con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_' + str(page) + '.html').read()
    # Locate the first link on this page: '<a title=' ... 'href=' ... '.html'.
    title = con.find(r'<a title=')
    href = con.find(r'href=', title)
    html = con.find(r'.html', href)
    # str.find returns -1 once a marker can no longer be found.
    while title != -1 and href != -1 and html != -1 and len(url) < 350:
        # +6 skips 'href=' plus one more character (presumably the opening
        # quote); +5 keeps the '.html' suffix inside the slice.
        link = con[href + 6:html + 5]
        url.append(link)
        # Running 1-based link number followed by the URL.
        print('%d %s' % (len(url), link))
        # Resume the search just after the link that was just extracted.
        title = con.find(r'<a title=', html)
        href = con.find(r'href=', title)
        html = con.find(r'.html', href)
        content = urllib.urlopen(link).read()
        # The last 26 characters of the URL serve as the file name
        # (presumably the 'blog_<id>.html' part -- confirm against real URLs).
        with open('/tmp/sina/' + link[-26:], 'wb') as f:
            f.write(content)
        time.sleep(5)  # pause between requests
print('Download Over!')

运行结果：

本文出自 “World” 博客，请务必保留此出处http://xiajie.blog.51cto.com/6044823/1679997

Python 简单爬虫功能实现

标签：linux 爬虫 python

原文地址：http://xiajie.blog.51cto.com/6044823/1679997

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行