码迷,mamicode.com
首页 > 其他好文 > 详细

定向爬虫实战笔记

时间:2015-12-20 22:35:04      阅读:317      评论:0      收藏:0      [点我收藏+]

标签:

定向爬虫实战笔记

定向爬虫实战笔记

流程图如下:

技术分享

来自追女神助手(痴汉)v0.1:

1.#-*-coding:utf8-*-
2.
3.import smtplib
4.from email.mime.text import MIMEText
5.import requests
6.from lxml import etree
7.import os
8.import time
9.import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding() reachable again so
# implicit str <-> unicode conversions use UTF-8 instead of ASCII.
# Python 3 has neither a builtin reload() nor sys.setdefaultencoding(), and
# its default encoding is already UTF-8, so the hack is skipped there.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')
12.
13.
14.
class mailhelper(object):
    """Send plain-text e-mail through an SMTP server.

    The account settings default to the placeholder values and can be
    overridden per instance through the constructor.
    """

    def __init__(self, mail_host="smtp.xxxx.com", mail_user="xxxx",
                 mail_pass="xxxx", mail_postfix="xxxx.com"):
        """Store the SMTP account settings.

        Args:
            mail_host: SMTP server host name (optionally ``host:port``).
            mail_user: Login user name; also the local part of the sender.
            mail_pass: Login password.
            mail_postfix: Domain part of the sender address.
        """
        self.mail_host = mail_host        # SMTP server
        self.mail_user = mail_user        # user name
        self.mail_pass = mail_pass        # password
        self.mail_postfix = mail_postfix  # domain suffix of the sender mailbox

    def send_mail(self, to_list, sub, content):
        """Send ``content`` with subject ``sub`` to every address in ``to_list``.

        Args:
            to_list: List of recipient addresses.
            sub: Subject line.
            content: Message body, sent as UTF-8 ``text/plain``.

        Returns:
            True if the message was handed to the server, False if any step
            (connect, login, send) raised; the error text is printed.
        """
        me = "xxoohelper" + "<" + self.mail_user + "@" + self.mail_postfix + ">"
        msg = MIMEText(content, _subtype='plain', _charset='utf-8')
        msg['Subject'] = sub
        msg['From'] = me
        # Address lists in mail headers are comma-separated, not ';'-separated.
        msg['To'] = ", ".join(to_list)

        server = None
        try:
            server = smtplib.SMTP()
            server.connect(self.mail_host)
            server.login(self.mail_user, self.mail_pass)
            server.sendmail(me, to_list, msg.as_string())
            return True
        except Exception as e:
            # Best-effort notification: report the failure and let the caller
            # decide what to do with the False result.
            print(str(e))
            return False
        finally:
            # Release the connection on the failure paths as well.
            if server is not None:
                server.close()
42.
class xxoohelper(object):
    """Log in to weibo.cn and fetch the first post of one account.

    Posts that were already reported are remembered in ``weibo.txt`` in the
    current working directory, one entry per ``tosave`` call.
    """

    def __init__(self):
        """Set the page to crawl and the login endpoint."""
        self.url = 'http://weibo.cn/u/xxxxxxx'  # address of the weibo page to crawl
        self.url_login = 'https://login.weibo.cn/login/'
        # Replaced by getData() with the real form target of the login page.
        self.new_url = self.url_login

    def getSource(self):
        """Download ``self.url`` and return the raw response body."""
        html = requests.get(self.url).content
        return html

    def getData(self, html):
        """Build the login form payload from the page returned by getSource().

        Reads the name of the password field, the ``vk`` token and the form
        ``action`` from ``html`` and stores the resulting post target in
        ``self.new_url``.

        Args:
            html: Markup of a page containing the login form.

        Returns:
            dict with the fields to POST to ``self.new_url``.

        Raises:
            IndexError: if the password input, the ``vk`` input or a POST
                form is missing from ``html``.
        """
        selector = etree.HTML(html)
        password = selector.xpath('//input[@type="password"]/@name')[0]
        vk = selector.xpath('//input[@name="vk"]/@value')[0]
        action = selector.xpath('//form[@method="post"]/@action')[0]
        self.new_url = self.url_login + action
        data = {
            'mobile': 'xxxxx@xxx.com',
            # The password field name is taken from the form, not hard-coded.
            password: 'xxxxxx',
            'remember': 'on',
            'backURL': 'http://weibo.cn/u/xxxxxx',  # change this to the weibo address
            'backTitle': u'微博',
            'tryCount': '',
            'vk': vk,
            'submit': u'登录',
        }
        return data

    def getContent(self, data):
        """POST the login form and return the text of the first post.

        Args:
            data: Form payload produced by getData().

        Returns:
            Text of the third ``span.ctt`` element with every ``http://``
            removed, followed by the text of the first ``span.ct`` element.

        Raises:
            IndexError: if the response has fewer than three ``span.ctt``
                elements or no ``span.ct`` text.
        """
        newhtml = requests.post(self.new_url, data=data).content
        new_selector = etree.HTML(newhtml)
        content = new_selector.xpath('//span[@class="ctt"]')
        # NOTE(review): index 2 is presumably the first real post, the two
        # earlier matches being profile text -- confirm against the live page.
        # u'%s' % ... coerces to text on Python 2 and 3 alike (unicode() is
        # Python 2 only).
        newcontent = (u'%s' % content[2].xpath('string(.)')).replace('http://', '')
        sendtime = new_selector.xpath('//span[@class="ct"]/text()')[0]
        sendtext = newcontent + sendtime
        return sendtext

    def tosave(self, text):
        """Append ``text`` plus a newline to ``weibo.txt``."""
        # 'with' closes the file even when the write raises.
        with open('weibo.txt', 'a') as f:
            f.write(text + '\n')

    def tocheck(self, data):
        """Tell whether ``data`` still has to be reported.

        Args:
            data: Post text as returned by getContent().

        Returns:
            True if ``weibo.txt`` does not exist or holds no entry equal to
            ``data``; False if ``data`` was saved before.
        """
        if not os.path.exists('weibo.txt'):
            return True
        with open('weibo.txt', 'r') as f:
            saved = f.read()
        # Search the whole file for "\n<data>\n" instead of comparing single
        # lines, so an entry that itself contains newlines is still found.
        return ('\n' + data + '\n') not in ('\n' + saved)
98.
def main(poll_interval=30):
    """Poll the weibo page forever and mail every post not seen before.

    Each cycle fetches the page, logs in, reads the first post and checks it
    against the saved history. A new post is mailed, appended to the history
    (whether or not the mail went out) and printed.

    Args:
        poll_interval: Seconds to sleep between two cycles.

    Any exception raised while fetching or parsing propagates and ends the
    loop.
    """
    mailto_list = ['xxxxx@qq.com']  # put the receiving mailbox here
    helper = xxoohelper()
    while True:
        source = helper.getSource()
        data = helper.getData(source)
        content = helper.getContent(data)
        if helper.tocheck(content):
            if mailhelper().send_mail(mailto_list, u"女神更新啦", content):
                print(u"发送成功")
            else:
                print(u"发送失败")
            helper.tosave(content)
            print(content)
        else:
            print(u'pass')
        time.sleep(poll_interval)


if __name__ == '__main__':
    main()

定向爬虫实战笔记

标签:

原文地址:http://www.cnblogs.com/XBlack/p/5061823.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!