# Tags:
# -*- coding: utf-8 -*-
"""Scrape ppdai.com listing pages and store the bids they show in SQLite.

Each page number in ``range(page_start, page_end)`` is fetched from
``http://www.ppdai.com/list/<page>``.  Every table row matched by
``pattern`` whose amount is not ``'0'`` is inserted into the ``tradelog``
table of the database at ``DB_PATH``.

The script was written for Python 2 (``urllib2``); the import fallbacks
below let the same code run on Python 3 as well.
"""
import os  # noqa: F401 -- unused below; kept from the original import list
import re
import sqlite3
from contextlib import closing

try:
    import urllib2  # Python 2, as in the original script
except ImportError:
    import urllib.request as urllib2  # Python 3 home of Request/urlopen

try:
    import winsound  # noqa: F401 -- Windows-only and unused below
except ImportError:
    winsound = None

# SQLite file the records are written to.
DB_PATH = r'C:\Users\Jian Fang\Desktop\ppdai.db'

# Listing pages live at LIST_URL + <page number>.
LIST_URL = 'http://www.ppdai.com/list/'

INSERT_SQL = ('INSERT INTO tradelog (user_id, money, rate, date, time) '
              'VALUES (?,?,?,?,?)')

# Number of inserts between two commits.
COMMIT_EVERY = 1000

# Leading characters dropped from the captured amount before it is stored.
# Presumably a fixed prefix in the page markup (e.g. a currency-symbol
# entity) -- TODO confirm against a real page.
AMOUNT_PREFIX_LEN = 6

# One-off table setup, left disabled exactly as in the original script:
# cursor.execute('DROP TABLE IF EXISTS tradelog')
# cursor.execute('CREATE TABLE tradelog (user_id varchar(16), money varchar(10), rate varchar(10), date varchar(12), time varchar(10))')

# One <tr> of the bid table: a user link followed by three plain cells
# (rate, amount, "date time").  re.VERBOSE makes the layout whitespace
# below insignificant.
# NOTE(review): the quote after ``href=`` was mangled in the pasted source;
# an ASCII apostrophe is assumed -- confirm against the live markup.
pattern = re.compile(
    r"""
    <tr>[^<]*
    <td>[^<]*
    <a\W*href='/user/[^>]*>(?P<user>[^<]*)</a>[^<]*
    </td>[^<]*
    <td>\s*
    (?P<rate>\S*)[^<]*
    </td>[^<]*
    <td>\s*
    (?P<amount>\S*)[^<]*
    </td>[^<]*
    <td>\s*
    (?P<date>\S*)\s*(?P<time>\S*)[^<]*
    </td>[^<]*
    </tr>
    """,
    re.VERBOSE | re.MULTILINE,
)

# Half-open range of page numbers to visit.
page_start = 226153
page_end = 300000


def parse(url):
    """Fetch *url* and return the bid rows found in it.

    Returns a list (possibly empty) of dicts with the keys ``user``,
    ``rate``, ``amount``, ``date`` and ``time``, or ``None`` when the page
    could not be fetched.
    """
    try:
        # Send a browser-like User-Agent, as the original did.
        req = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})
        with closing(urllib2.urlopen(req)) as resp:
            raw = resp.read()
    except Exception:
        # Best effort: a page that cannot be fetched is skipped by the
        # caller.  Unlike a bare ``except:``, this lets Ctrl-C through.
        return None
    # Decode so the text pattern also works on Python 3, where read()
    # returns bytes.  Assumes the pages are UTF-8 -- TODO confirm;
    # undecodable bytes are replaced instead of dropping the page.
    html = raw.decode('utf-8', 'replace')
    return [m.groupdict() for m in pattern.finditer(html)]


def store_rows(db, rows, pending=0, batch_size=COMMIT_EVERY):
    """Insert *rows* into ``tradelog`` and commit once per full batch.

    Rows whose amount (after dropping the prefix) equals ``'0'`` are
    skipped.

    Args:
        db: open ``sqlite3`` connection.
        rows: dicts as returned by ``parse``.
        pending: inserts already made since the last commit.
        batch_size: number of inserts that triggers a commit.

    Returns:
        The number of inserts made since the last commit.
    """
    cursor = db.cursor()
    for row in rows:
        money = row['amount'][AMOUNT_PREFIX_LEN:]
        if money == '0':
            continue
        cursor.execute(
            INSERT_SQL,
            (row['user'], money, row['rate'], row['date'], row['time']),
        )
        # Count from zero so a commit really covers batch_size rows; the
        # original counted from 1 and committed after only 999 inserts.
        pending += 1
        if pending == batch_size:
            db.commit()
            pending = 0
            print('%d records has been submitted!!!!!!!' % batch_size)
    return pending


def main():
    """Walk every listing page and store its bids in the database."""
    sName = '%d' % page_start  # defined up front for the error message
    pending = 0
    with closing(sqlite3.connect(DB_PATH)) as ppdai_db:
        try:
            for page_index in range(page_start, page_end):
                sName = '%d' % page_index
                rows = parse(LIST_URL + sName)
                print(sName)
                if rows is not None:
                    pending = store_rows(ppdai_db, rows, pending)
            ppdai_db.commit()
            print('jobes done!')
        except Exception as exc:
            print('there is an error at' + sName)
            print(repr(exc))
        finally:
            # Keep the rows inserted since the last batch commit even when
            # the run stops early.
            ppdai_db.commit()


if __name__ == '__main__':
    main()
# Tags:
# Original article: http://www.cnblogs.com/aceofspades/p/4811625.html