您的位置:首页 > 其它

广州楼盘抓取分析-分析问题

2016-02-28 21:50 399 查看
上文其实还是有不少问题的。

1. 顺序执行,抓取效率比较低;2. 不能断点续传,一旦中断只能从头重新抓取。

那么,解决办法是什么呢?

对于问题1,可以采用生产者消费者模式来改写,代码如下

# -*- coding: utf-8 -*-
#######################################################################
# Copyright (C) 2005-2016 UC Mobile Limited. All Rights Reserved
# File          : first_sale_spider.py
#
# Creation      : 2016/2/23 19:41
# Author        : shufeng.lsf@ucweb.com
#######################################################################
import random
from threading import Thread

import requests
import re

import time
from pyquery import PyQuery as pq
from Queue import Queue
import MySQLdb
import uniout
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

community_list = []

HOST = "127.0.0.1"
USER = "root"
PASSWD = ""
DB = "house_analysis"
PORT = 3306

queue = Queue(10)

class DBOperate(object):
    """Thin wrapper around a MySQLdb connection for simple INSERTs.

    Opens the connection eagerly in __init__ and closes it when the
    object is garbage-collected.
    """

    def __init__(self, host, user, passwd, db, port, charset="utf8"):
        self.host = host
        self.user = user
        self.passwd = passwd
        self.db = db
        self.port = port
        self.charset = charset
        # Bug fix: the original hard-coded charset="utf8" here, silently
        # ignoring the constructor's charset parameter.
        self.conn = MySQLdb.connect(self.host, self.user, self.passwd,
                                    self.db, self.port, charset=self.charset)
        self.cur = self.conn.cursor()

    def insertSql(self, sql, params=None):
        """Execute an INSERT and commit immediately.

        params: optional sequence of values for a parameterized query
        (%s placeholders). Prefer this over string interpolation so
        quotes in scraped data cannot break the statement.
        """
        if params is None:
            self.cur.execute(sql)
        else:
            self.cur.execute(sql, params)
        self.conn.commit()

    def __del__(self):
        # Robustness: self.cur/self.conn may not exist if __init__
        # failed before (or while) connecting.
        try:
            self.cur.close()
            self.conn.close()
        except AttributeError:
            pass

def requestByGet(url):
    """Fetch *url* with an HTTP GET and return the raw response body."""
    return requests.get(url).content

def getNextPage(content):
    """Return the 'next page' URL found in *content*, or '' if absent."""
    match = re.search(r'<a href="(.+?)" class="next-page next-link">下一页</a>', content)
    return match.group(1) if match else ''

def getCommunityList(content):
community_urls = re.findall(r'data-link="(http://gz.fang.anjuke.com/loupan/\d+?.html)"',content)
print "正在采集...",community_urls
if len(community_urls)>0:
return community_urls

def getHouseInfo(url):
    """Scrape one community detail page and return its fields as a dict.

    pq() is handed the URL string directly; pyquery fetches http(s)
    strings itself. Returned keys: name, area, location,
    detail_location, house_style, price.
    """
    p = pq(url)
    name = p('h1').text().strip()
    style = p('.house-item').text().split(",")[0].strip()
    price = p('.sp-price').text().strip()
    l = p('.lpAddr-text').text()
    # Address text looks like "[ area-zone ] detail"; split on the brackets.
    location = re.split('\[ | \]',l)
    # Hoisted: the original split location[-2] on '-' twice; also dropped
    # an unused local `address` that duplicated detail_location.
    area_zone = location[-2].split('-')
    area = area_zone[0].strip()
    zone = area_zone[1].strip()
    detail_location = location[-1].strip()
    result = {
        "name": name,
        "area": area,
        "location": zone,
        "detail_location": detail_location,
        "house_style": style,
        "price": price
    }
    return result

def detailPageHandler(cur, detail_url):
result = getHouseInfo(detail_url)
print "result:",result
cur.insertSql("insert into first_sale (name,area,location,detail_location,house_style,price) VALUES('%s','%s','%s','%s','%s','%s')" % (
result['name'],
result['area'],
result['location'],
result['detail_location'],
result['house_style'],
result['price']
))

class UrlProducer(Thread):
def __init__(self, start_url):
Thread.__init__(self)
self.start_url = start_url

def run(self):
global queue
while True:
content = requestByGet(self.start_url)
next_url = getNextPage(content)
community_urls = getCommunityList(content)
for url in community_urls:
queue.put(url)
time.sleep(random.random())
print "进入队列的url:",url
if next_url != '':
self.start_url = next_url
continue
else:
break

class GetHouseInfo(Thread):
def __init__(self, cur):
Thread.__init__(self)
self.cur = cur

def run(self):
global queue
while True:
url = queue.get()
detailPageHandler(self.cur, url)
queue.task_done()
time.sleep(random.random())
print "处理完毕的url:", url

def main():
    """Open the DB connection and launch the producer/consumer threads."""
    db = DBOperate(host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT)
    producer = UrlProducer("http://gz.fang.anjuke.com/loupan/?from=navigation")
    consumer = GetHouseInfo(db)
    producer.start()
    consumer.start()

# Script entry point.
if __name__ == '__main__':
    main()


2. 对于不能断点续传的问题,可以用异常捕获的方式把中断时正在处理的 url 保存到文件,下次启动时直接从文件中读取该 url,从上次中断的位置继续执行即可。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: