python 爬取GKGY会员多线程demo
2015-09-17 10:54
716 查看
# -*- coding: utf-8 -*-
"""Multi-threaded demo spider that collects GeekPark (GKGY) member names.

Topic pages are fetched by a small pool of worker threads, author and
commenter names are extracted with regular expressions, and the
de-duplicated names are written tab-separated to ``outGKGY.txt``.

The script runs on both Python 2 and Python 3.
"""
__author__ = 'wangjingyao'

import contextlib
import io
import random
import re
import sys
import threading
import time
import urllib

try:  # Python 2 module names
    import Queue
    import urllib2
except ImportError:  # Python 3: same API under different module names
    import queue as Queue
    import urllib.request as urllib2

try:
    import user_agents  # third-party; not used by the code below
except ImportError:
    user_agents = None

if sys.version_info[0] == 2:
    # Legacy Python 2 default-encoding switch kept from the original script.
    # The functions below convert between bytes and text explicitly.
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf8')

_DATA = []  # names collected by all workers
FILE_LOCK = threading.Lock()
SHARE_Q = Queue.Queue()  # unbounded queue of URLs to crawl
_WORKER_THREAD_NUM = 10  # number of worker threads

# Compiled once at import time instead of once per page.
_AUTHOR_PATTERN = re.compile('<span itemprop="author">(.*?)</span>')
_COMMENT_PATTERN = re.compile(
    '<div class="comment-detail"><a href=".*?">(.*?)</a>')
# Commenter name that is filtered out of the results.
_EXCLUDED_COMMENTER = u'极客漫游者'


class MyThread(threading.Thread):
    """Thread that simply runs the callable it was given."""

    def __init__(self, func):
        super(MyThread, self).__init__()
        self.func = func  # the thread body

    def run(self):
        self.func()


def worker():
    """Process URLs from ``SHARE_Q`` until the queue is empty.

    Each URL is fetched and parsed, followed by a one second pause.
    """
    while True:
        # get_nowait() avoids the race between empty() and a blocking get():
        # another thread may take the last item in between, which would leave
        # this thread blocked forever.
        try:
            url = SHARE_Q.get_nowait()
        except Queue.Empty:
            return
        try:
            getPageItems(get_page(url))
            time.sleep(1)
        except Exception as exc:
            # One bad page must not kill the worker; report it and move on.
            print("Failed to process %s: %s" % (url, exc))
        finally:
            # Always acknowledge the task, otherwise SHARE_Q.join() in main()
            # would never return.
            SHARE_Q.task_done()


def get_page(url):
    """Fetch the HTML of ``url``.

    Args:
        url: address of the page to download.

    Returns:
        The page as text, decoded as UTF-8 with undecodable bytes dropped,
        or None when the request fails with ``URLError``.
    """
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36')
    request = urllib2.Request(url, headers={'User-Agent': user_agent})
    try:
        # closing() guarantees the connection is released on every path.
        with contextlib.closing(urllib2.urlopen(request)) as response:
            raw = response.read()
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print("The server couldn't fulfill the request.")
            print("Error code: %s" % e.code)
        elif hasattr(e, "reason"):
            print("We failed to reach a server. "
                  "Please check your url and read the Reason")
            print("Reason: %s" % e.reason)
        return None
    # Decode explicitly so malformed bytes cannot raise UnicodeDecodeError.
    return raw.decode('utf-8', 'ignore')


def getPageItems(pageCode):
    """Extract author and commenter names from a page and record them.

    Every name found is appended to the module-level ``_DATA`` list.

    Args:
        pageCode: HTML of the page, either as text or as GBK-encoded bytes.

    Returns:
        The list of names found on this page, or None when ``pageCode`` is
        empty or None.
    """
    if not pageCode:
        print('pageCode init error')
        return None
    if isinstance(pageCode, bytes):
        # Earlier versions of get_page() returned GBK-encoded bytes.
        pageCode = pageCode.decode('gbk', 'ignore')

    # Article authors.
    found = _AUTHOR_PATTERN.findall(pageCode)
    print("authorSpider------")

    # Members who commented, minus the excluded name.
    found.extend(name for name in _COMMENT_PATTERN.findall(pageCode)
                 if name != _EXCLUDED_COMMENTER)
    print("commentSpider------")

    _DATA.extend(found)
    return found


def main():
    """Crawl the fixed range of topic pages and write the unique names."""
    gkgy_url = "http://www.geekpark.net/topics/{page}"
    # All tasks are queued up front; a real crawler would feed the queue
    # continuously.
    for index in range(210714, 213394):
        SHARE_Q.put(gkgy_url.format(page=index))

    threads = []
    for _ in range(_WORKER_THREAD_NUM):
        thread = MyThread(worker)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    SHARE_Q.join()

    # The output stays GBK-encoded; characters GBK cannot represent are
    # dropped.
    with io.open("outGKGY.txt", "w", encoding="gbk",
                 errors="ignore") as my_file:
        my_file.write(u"".join(name + u"\t" for name in set(_DATA)))
    print("Spider Successful!!!")


if __name__ == '__main__':
    main()
相关文章推荐
- 零基础学python-18.1 函数的设计
- 零基础学python-18.1 函数的设计
- Python如何解析动态网页
- python写的小程序--构造大量测试数据
- python上的并发
- [python] 0x7 Python Tutorial: Web Scanning and Exploitation
- 一、安装pip
- python静态方法和类方法
- Python字符串反转
- 配置豆瓣镜像作为python 库的下载源
- Python 多线程学习
- Python 运算符
- python序列类型
- Python学习笔记1
- 谈谈exifread
- python相关——如何安装pip
- (15/09/16)学python第三天
- 轻松python文本专题-判断对象里面是否是类字符串(推荐使用isinstance(obj,str))
- 零基础学python-18.1 函数的设计
- 零基础学python-8.7 字典常用方法