您的位置:首页 > 其它

P2P爬虫-拍拍贷

2015-09-15 21:55 387 查看
# -*- coding: utf-8 -*-
import urllib2
import re
import os
import sqlite3
import winsound

# Open (or create) the SQLite database file that stores the scraped records.
ppdai_db = sqlite3.connect(r'C:\Users\Jian Fang\Desktop\ppdai.db')
cursor = ppdai_db.cursor()

# Make sure the target table exists so that inserts into `tradelog` do not
# fail on a fresh database file.  IF NOT EXISTS keeps the rows collected by
# earlier runs; the table is deliberately never dropped here.
cursor.execute(
    'CREATE TABLE IF NOT EXISTS tradelog ('
    'user_id varchar(16), money varchar(10), rate varchar(10), '
    'date varchar(12), time varchar(10))'
)

# Regular expression matching one table row (<tr>) made of four cells: a
# link to a user page, then the rate, the amount and a "date time" pair.
# The whole pattern is a single raw string so the regex escapes (\W, \s, \S)
# reach the regex engine untouched instead of relying on Python leaving
# unknown string escapes alone.  re.VERBOSE makes the engine ignore the
# layout whitespace used below.
pattern = re.compile(
    r"""
    <tr>[^<]*
      <td>[^<]*
        <a\W*href='/user/[^>]*>(?P<user>[^<]*)</a>[^<]*
      </td>[^<]*
      <td>\s*
        (?P<rate>\S*)[^<]*
      </td>[^<]*
      <td>\s*
        (?P<amount>\S*)[^<]*
      </td>[^<]*
      <td>\s*
        (?P<date>\S*)\s*(?P<time>\S*)[^<]*
      </td>[^<]*
    </tr>
    """,
    re.VERBOSE | re.MULTILINE,
)

def parse(url, timeout=30):
    """Fetch *url* and extract every row matched by the module-level ``pattern``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the caller indefinitely.

    Returns:
        A list with one dict per match, keyed by the pattern's named groups
        (``user``, ``rate``, ``amount``, ``date``, ``time``); the list is
        empty when the page contains no matching row.  ``None`` when the
        page could not be downloaded.
    """
    # Pretend to be a browser.
    req = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib2.urlopen(req, timeout=timeout)
        try:
            html = response.read()
        finally:
            # Always release the connection, even when the read fails.
            response.close()
    except Exception:
        # Best effort: any download failure is reported to the caller as
        # None.  Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate, so the crawl can still be stopped with Ctrl-C.
        return None
    return [m.groupdict() for m in pattern.finditer(html)]

# Range of listing ids to crawl: page_start inclusive, page_end exclusive.
page_start = 226153
page_end = 300000

# Number of inserted rows between two commits.
COMMIT_BATCH = 1000

pending = 0      # rows inserted since the last commit
page_name = ''   # id of the page being processed, used in the error report
try:
    for page_index in range(page_start, page_end):
        page_name = '%d' % page_index
        page_url = 'http://www.ppdai.com/list/' + page_name
        records = parse(page_url)
        print(page_name)
        if records is None:
            # The page could not be downloaded; move on to the next one.
            continue
        for record in records:
            # Drop the first six characters of the amount cell (presumably a
            # currency prefix such as "&#165;" -- TODO confirm) and skip rows
            # whose remaining amount is "0".
            money = record['amount'][6:]
            if money == '0':
                continue
            cursor.execute(
                'INSERT INTO tradelog (user_id, money, rate, date, time) VALUES (?,?,?,?,?)',
                (record['user'], money, record['rate'], record['date'], record['time']))
            pending += 1
            if pending >= COMMIT_BATCH:
                # Commit in batches rather than once per row.
                ppdai_db.commit()
                pending = 0
                print('%d records has been submitted!!!!!!!' % COMMIT_BATCH)

    ppdai_db.commit()
    print('jobes done!')
except Exception as err:
    # Report where the crawl stopped and why.  KeyboardInterrupt is not an
    # Exception subclass, so Ctrl-C still propagates after the cleanup below.
    print('there is an error at' + page_name)
    print(repr(err))
finally:
    # Keep whatever was inserted since the last batch commit, then release
    # the database file.
    try:
        ppdai_db.commit()
    finally:
        ppdai_db.close()


  
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: