码迷,mamicode.com
首页 > 其他好文 > 详细

P2P爬虫-拍拍贷

时间:2015-09-15 23:06:40      阅读:299      评论:0      收藏:0      [点我收藏+]

标签:

# -*- coding: utf-8 -*-
import urllib2
import re
import os
import sqlite3
import winsound

# Open the SQLite database file that stores the scraped trade log.
# NOTE(review): the path is hard-coded to one Windows user's desktop;
# adjust it before running on another machine.
ppdai_db = sqlite3.connect(r'C:\Users\Jian Fang\Desktop\ppdai.db')
cursor = ppdai_db.cursor()

# Create the table. Uncomment for the first run only: the DROP statement
# discards any rows already stored in tradelog.
# cursor.execute('DROP TABLE IF EXISTS tradelog')
# cursor.execute('CREATE TABLE tradelog (user_id varchar(16), money varchar(10), rate varchar(10), date varchar(12), time varchar(10))')


# Regular expression matching one <tr> row of the bid table: a first cell
# holding a link to /user/..., followed by three more cells.
# Named groups: user, rate, amount, date, time.
# Written as a single raw string in verbose mode, so the layout whitespace
# below is ignored by the regex engine and backslash classes (\W, \s, \S)
# reach it unmodified.
# NOTE(review): assumes the page writes href with ASCII single quotes
# (href='/user/...') -- confirm against the live HTML.
pattern = re.compile(r"""
    <tr>[^<]*
      <td>[^<]*
        <a\W*href='/user/[^>]*>(?P<user>[^<]*)</a>[^<]*
      </td>[^<]*
      <td>\s*
        (?P<rate>\S*)[^<]*
      </td>[^<]*
      <td>\s*
        (?P<amount>\S*)[^<]*
      </td>[^<]*
      <td>\s*
        (?P<date>\S*)\s*(?P<time>\S*)[^<]*
      </td>[^<]*
    </tr>""", re.VERBOSE)

# Fetch one listing page and extract its bid rows.
def parse(url):
    """Download *url* and return every table row matched by ``pattern``.

    Returns a list with one dict per match, keyed by the pattern's named
    groups (user, rate, amount, date, time); the list is empty when the
    page contains no matching row. Returns None when the download fails.
    """
    req = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})  # pretend to be a browser
    try:
        response = urllib2.urlopen(req)
        try:
            html = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except Exception:
        # Best effort: a failed page is reported as None so the caller can
        # skip it. KeyboardInterrupt/SystemExit are deliberately not caught,
        # so Ctrl-C still stops the crawl.
        return None
    return [m.groupdict() for m in pattern.finditer(html)]
		


# Crawl listing pages page_start .. page_end - 1 and store every non-zero
# bid in the tradelog table, committing in batches of 1000 rows.
page_start = 226153
page_end = 300000
index = 0    # rows inserted since the last commit
sName = ''   # current page id; pre-set so the error report never hits an unbound name
try:
    for page_index in range(page_start, page_end):
        sName = '%d' % page_index
        sUrl = 'http://www.ppdai.com/list/' + sName
        mat1 = parse(sUrl)
        print(sName)
        if mat1 is None:
            # Download failed; skip this page.
            continue
        for x in mat1:
            # Drop the first 6 characters of the amount cell.
            # NOTE(review): presumably a currency prefix such as the
            # "&#165;" entity -- confirm against the live HTML.
            money = x['amount'][6:]
            if money == '0':
                continue
            cursor.execute(
                'INSERT INTO tradelog (user_id, money, rate, date, time) VALUES (?,?,?,?,?)',
                (x['user'], money, x['rate'], x['date'], x['time']))
            index += 1
            if index == 1000:
                index = 0
                ppdai_db.commit()   # save the batch
                print('1000 records has been submitted!!!!!!!')

    ppdai_db.commit()   # save the final partial batch
    print('jobes done!')
except Exception as err:
    print('there is an error at' + sName)
    print(repr(err))    # show what actually failed instead of hiding it
finally:
    # Keep rows inserted since the last batch commit even when the crawl
    # stops on an error or Ctrl-C; after a clean run nothing is pending.
    ppdai_db.commit()

  

P2P爬虫-拍拍贷

标签:

原文地址:http://www.cnblogs.com/aceofspades/p/4811625.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!