
Python multithreaded demo: scraping GKGY (GeekPark) member names

2015-09-17 10:54
# -*- coding: utf-8 -*-
__author__ = 'wangjingyao'
import re
import sys
import threading, Queue, time
import urllib2

reload(sys)
sys.setdefaultencoding('utf8')  # set the default encoding

_DATA = []
FILE_LOCK = threading.Lock()
SHARE_Q = Queue.Queue()   # unbounded task queue
_WORKER_THREAD_NUM = 10   # number of worker threads

class MyThread(threading.Thread):

    def __init__(self, func):
        super(MyThread, self).__init__()  # call the parent constructor
        self.func = func                  # the worker function this thread will run

    def run(self):
        self.func()

def worker():
    global SHARE_Q
    while not SHARE_Q.empty():
        try:
            url = SHARE_Q.get(block=False)  # grab a task; don't block if another thread drained the queue
        except Queue.Empty:
            break
        my_page = get_page(url)
        getPageItems(my_page)  # extract the author / commenter names on this page
        # write_into_file(temp_data)
        time.sleep(1)
        SHARE_Q.task_done()

def get_page(url):
    """
    Fetch the HTML of the page at the given url.
    Args:
        url: the url of the page to crawl
    Returns:
        the full page HTML as a unicode string
    Raises:
        URLError: exception raised for the url
    """
    try:
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        my_page = response.read().decode('utf-8', 'ignore')  # decode to unicode, as promised in the docstring
        return my_page
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print "The server couldn't fulfill the request."
            print "Error code: %s" % e.code
        elif hasattr(e, "reason"):
            print "We failed to reach a server. Please check your url and read the Reason."
            print "Reason: %s" % e.reason
        return None

def getPageItems(pageCode):
    """
    Regex-match the author and commenter names out of the full page HTML.
    Args:
        pageCode: the page HTML text used for the regex matching
    """
    if not pageCode:
        print 'pageCode init error'
        return None
    # scrape the article author
    pattern = re.compile('<span itemprop="author">(.*?)</span>')
    items = re.findall(pattern, pageCode)
    for item in items:
        _DATA.append(item)
    print "authorSpider------"
    # scrape the commenting members
    partternComment = re.compile('<div class="comment-detail"><a href=".*?">(.*?)</a>')
    itemcomments = re.findall(partternComment, pageCode)
    for itemcomment in itemcomments:
        if itemcomment != u'极客漫游者':  # filter out the '极客漫游者' account
            _DATA.append(itemcomment)
    print "commentSpider------"

def main():
    global SHARE_Q
    threads = []
    gkgy_url = "http://www.geekpark.net/topics/{page}"
    # enqueue the tasks; in real use, tasks should be fed into the queue continuously
    for index in xrange(210714, 213394):
        SHARE_Q.put(gkgy_url.format(page=index))
    for i in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker)
        thread.start()  # the thread starts working on tasks
        threads.append(thread)
    for thread in threads:
        thread.join()
    SHARE_Q.join()
    _DATAs = list(set(_DATA))  # de-duplicate the collected names
    with open("outGKGY.txt", "w+") as my_file:
        for page in _DATAs:
            my_file.write(page + "\t")
    print "Spider Successful!!!"

if __name__ == '__main__':
    main()
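
For anyone on Python 3, here is a minimal, untested sketch of the same fetch-and-parse worker pattern using only the standard library (queue, threading, urllib.request). The URL template and the two regular expressions are taken from the demo above; the reduced page range, helper names, and the 10-second timeout are illustrative assumptions, not part of the original script.

# -*- coding: utf-8 -*-
# Python 3 sketch: a pool of threads pulling page URLs from a shared queue.
import queue
import re
import threading
import urllib.request

TASKS = queue.Queue()
RESULTS = set()
RESULTS_LOCK = threading.Lock()
AUTHOR_RE = re.compile(r'<span itemprop="author">(.*?)</span>')
COMMENT_RE = re.compile(r'<div class="comment-detail"><a href=".*?">(.*?)</a>')

def fetch(url):
    # Fetch a page and return its HTML as a str; swallow network errors.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            return resp.read().decode('utf-8', 'ignore')
    except Exception as exc:
        print('fetch failed:', url, exc)
        return None

def worker():
    while True:
        try:
            url = TASKS.get_nowait()
        except queue.Empty:
            return
        html = fetch(url)
        if html:
            names = AUTHOR_RE.findall(html) + COMMENT_RE.findall(html)
            with RESULTS_LOCK:
                RESULTS.update(names)
        TASKS.task_done()

if __name__ == '__main__':
    # A small range for testing; the original walks topic ids 210714-213393.
    for page in range(210714, 210724):
        TASKS.put('http://www.geekpark.net/topics/{}'.format(page))
    threads = [threading.Thread(target=worker) for _ in range(10)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    with open('outGKGY.txt', 'w', encoding='utf-8') as fh:
        fh.write('\t'.join(RESULTS))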