Python 多线程采集示例
2017-11-19 17:20
204 查看
# coding:utf-8
"""多线程贴吧采集示例(Python 3).

采集线程从 pageQueue 取页码, 抓取对应列表页 HTML 放入 dataQueue;
解析线程从 dataQueue 取 HTML, 用 XPath 提取帖子标题, 逐行以 JSON
追加写入 duanzi.json。主线程用 Queue.join() 等待两个队列排空后,
置全局退出标志并回收所有工作线程。
"""
import json
import queue
import threading
import time

import requests
from lxml import etree

# 全局退出标志: 主线程在对应队列排空后置 True, 工作线程据此退出循环
CRAWL_EXIT = False
PARSE_EXIT = False


class ThreadCrawl(threading.Thread):
    """采集线程: 从 pageQueue 取页码, 抓取页面 HTML 并放入 dataQueue。

    :param threadName: 线程显示名(仅用于启动/结束日志)
    :param pageQueue:  待抓取页码队列 (queue.Queue[int])
    :param dataQueue:  抓取结果 HTML 文本队列 (queue.Queue[str])
    """

    def __init__(self, threadName, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

    def run(self):
        print("启动 " + self.threadName)
        while not CRAWL_EXIT:
            try:
                # 带超时的阻塞取数, 既避免忙等空转, 又能周期性检查退出标志
                page = self.pageQueue.get(timeout=0.1)
            except queue.Empty:
                continue
            try:
                # 贴吧列表页按 50 帖分页, pn 参数为起始偏移
                url = "https://tieba.baidu.com/f?kw=%E5%A6%B9%E5%AD%90&ie=utf-8&pn=" + str(page * 50)
                content = requests.get(url, headers=self.headers).text
                time.sleep(1)  # 限速, 避免请求过快
                self.dataQueue.put(content)
            except requests.RequestException:
                pass  # 单页抓取失败不影响其它页(保持原有 best-effort 行为)
            finally:
                # 无论成败都标记任务完成, 否则主线程的 pageQueue.join() 会挂死
                self.pageQueue.task_done()
        print("结束 " + self.threadName)


class ThreadParse(threading.Thread):
    """解析线程: 从 dataQueue 取 HTML, 提取标题写入共享文件。

    :param threadName: 线程显示名(仅用于启动/退出日志)
    :param dataQueue:  待解析 HTML 文本队列 (queue.Queue[str])
    :param filename:   已打开的输出文件对象(多个线程共享)
    :param lock:       保护文件写入的 threading.Lock
    """

    def __init__(self, threadName, dataQueue, filename, lock):
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.fileName = filename
        self.lock = lock

    def run(self):
        print("启动" + self.threadName)
        while not PARSE_EXIT:
            try:
                # 带超时的阻塞取数, 周期性检查退出标志
                html = self.dataQueue.get(timeout=0.1)
            except queue.Empty:
                continue
            try:
                self.parse(html)
            except Exception:
                pass  # 解析失败跳过该页(保持原有 best-effort 行为)
            finally:
                self.dataQueue.task_done()
        print("退出" + self.threadName)

    def parse(self, html):
        """解析一页 HTML, 把每个帖子标题作为一行 JSON 追加写入输出文件。"""
        doc = etree.HTML(html)
        # 贴吧帖子列表中标题链接的 XPath(页面改版后需同步更新)
        nodeList = doc.xpath('//*[@id="thread_list"]//li/div/div[2]/div[1]/div[1]/a')
        for title in nodeList:
            items = {
                "title" : title.text
            }
            # 文件对象被多个解析线程共享, 加锁保证每行写入完整
            with self.lock:
                self.fileName.write(json.dumps(items, ensure_ascii=False) + "\n")


def main():
    """装配队列与线程, 采集 10 页并等待采集/解析全部完成。"""
    global CRAWL_EXIT, PARSE_EXIT

    # 页码队列: 第 1~10 页
    pageQueue = queue.Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)

    # 采集结果队列(不限容量)
    dataQueue = queue.Queue()
    lock = threading.Lock()

    # with 保证异常时文件也能关闭; utf-8 编码由文件层统一处理
    with open("duanzi.json", "a", encoding="utf-8") as filename:
        # 三个采集线程
        crawlList = ["采集线程1号", "采集线程2号", "采集线程3号"]
        threadcrawl = []
        for threadName in crawlList:
            thread = ThreadCrawl(threadName, pageQueue, dataQueue)
            thread.start()
            threadcrawl.append(thread)

        # 三个解析线程
        parseList = ["解析线程1号", "解析线程2号", "解析线程3号"]
        threadparse = []
        for threadName in parseList:
            thread = ThreadParse(threadName, dataQueue, filename, lock)
            thread.start()
            threadparse.append(thread)

        # 阻塞等待所有页码被处理完(替代原先 100% CPU 的忙等轮询)
        pageQueue.join()
        CRAWL_EXIT = True
        print("pageQueue is empty")
        for thread in threadcrawl:
            thread.join()
        print("1")

        # 阻塞等待所有 HTML 被解析完
        dataQueue.join()
        PARSE_EXIT = True
        for thread in threadparse:
            thread.join()
        print("2")

    print("谢谢使用!")


if __name__ == "__main__":
    main()
相关文章推荐
- Python之多线程爬虫抓取网页图片的示例代码
- Python爬虫—多线程的简单示例
- python 多线程示例
- php与python实现的线程池多线程爬虫功能示例
- python的多线程示例
- Python(2.7.x)多线程的简单示例
- Python 3.X 调用多线程C模块,并在C模块中回调python函数的示例
- python 多线程采集网页完善版
- python网络爬虫采集联想词示例
- python实现的多线程端口扫描功能示例
- 尝试使用Python多线程抓取代理服务器IP地址的示例
- 【转】杰奇 jieqi 多线程自动采集同步源站 python源码
- python数据采集与多线程效率分析
- 基于Python多线程的TCP客户端/服务端应用示例
- python的多线程、多进程代码示例
- Python实现多线程HTTP下载器示例
- Python网络编程基于多线程实现多用户全双工聊天功能示例
- Python实现的多进程和多线程功能示例
- 尝试使用Python多线程抓取代理服务器IP地址的示例
- Python多线程应用示例