您的位置:首页 > 编程语言 > Python开发

python---多线程采集示例

2017-11-19 17:20 204 查看
# coding:utf-8
import json
import sys
import threading
import time
import types
import urllib2
from Queue import Empty, Queue

import requests
from bs4 import BeautifulSoup
from lxml import etree

CRAWL_EXIT = False
PARSE_EXIT = False

class ThreadCrawl(threading.Thread):
    """Crawler thread: pulls page numbers from pageQueue, downloads the
    corresponding Tieba listing page, and pushes the raw HTML onto dataQueue.

    Runs until the module-level CRAWL_EXIT flag is set to True by main().
    """

    def __init__(self, threadName, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName  # human-readable name used in log output
        self.pageQueue = pageQueue    # queue of page numbers still to fetch
        self.dataQueue = dataQueue    # queue receiving downloaded HTML text
        self.headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

    def run(self):
        print("启动 " + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Non-blocking get: raises Empty when the queue is drained,
                # so the CRAWL_EXIT flag is re-checked promptly instead of
                # blocking forever on an empty queue.
                page = self.pageQueue.get(False)
            except Empty:
                continue
            url = "https://tieba.baidu.com/f?kw=%E5%A6%B9%E5%AD%90&ie=utf-8&pn=" + str(page * 50)
            try:
                content = requests.get(url, headers=self.headers).text
            except requests.RequestException:
                # Best-effort crawl: a failed page is dropped (the original
                # code silently swallowed these errors too), but unlike a
                # bare except this no longer hides programming errors.
                continue
            time.sleep(1)  # throttle: be polite to the server between requests
            self.dataQueue.put(content)

        print("结束 " + self.threadName)

class ThreadParse(threading.Thread):
    """Parser thread: pulls raw HTML pages from dataQueue, extracts thread
    titles via XPath, and appends them as one JSON object per line to the
    shared output file.

    Runs until the module-level PARSE_EXIT flag is set to True by main().
    """

    def __init__(self, threadName, dataQueue, filename, lock):
        super(ThreadParse, self).__init__()
        self.threadName = threadName  # human-readable name used in log output
        self.dataQueue = dataQueue    # queue of HTML pages produced by crawlers
        self.fileName = filename      # open file object shared by all parser threads
        self.lock = lock              # serializes writes to the shared file

    def run(self):
        print("启动" + self.threadName)
        while not PARSE_EXIT:
            try:
                # Non-blocking get so the PARSE_EXIT flag is re-checked
                # promptly; Empty just means "nothing to do yet".
                html = self.dataQueue.get(False)
            except Empty:
                continue
            # Parse outside the except: a real parsing bug should surface
            # instead of being silently swallowed by a bare except.
            self.parse(html)
        print("退出" + self.threadName)

    def parse(self, html):
        """Extract every thread title from one listing page and write each
        one as a UTF-8 JSON line to the shared output file."""
        doc = etree.HTML(html)
        nodeList = doc.xpath('//*[@id="thread_list"]//li/div/div[2]/div[1]/div[1]/a')
        for title in nodeList:
            items = {
                "title" : title.text
            }
            # The file object is shared across parser threads, so writes
            # must be serialized to keep lines intact.
            with self.lock:
                self.fileName.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")

def main():
    """Drive the crawl: start three crawler and three parser threads, wait
    for all pages to be fetched and parsed, then close the output file.

    Shutdown protocol: once pageQueue drains, CRAWL_EXIT is set and the
    crawlers are joined; once dataQueue drains, PARSE_EXIT is set and the
    parsers are joined.
    """
    # Page numbers 1..10 to be fetched (bounded queue, capacity 10).
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)

    # Raw HTML handed from crawlers to parsers (unbounded).
    dataQueue = Queue()

    filename = open("duanzi.json", "a")

    # Serializes writes to the shared output file.
    lock = threading.Lock()

    # Names of the three crawler threads.
    crawlList = ["采集线程1号", "采集线程2号", "采集线程3号"]
    # Keep references so the threads can be joined later.
    threadcrawl = []
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    # Names of the three parser threads.
    parseList = ["解析线程1号", "解析线程2号", "解析线程3号"]
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, filename, lock)
        thread.start()
        threadparse.append(thread)

    # Wait until every page number has been taken by a crawler.
    # Poll with a short sleep instead of a tight busy-wait that would
    # burn a full CPU core while the crawlers work.
    while not pageQueue.empty():
        time.sleep(0.1)

    # pageQueue is empty: tell the crawler threads to exit their loops.
    global CRAWL_EXIT
    CRAWL_EXIT = True

    print("pageQueue is empty")

    for thread in threadcrawl:
        thread.join()
    print("1")

    # Wait until the parsers have consumed every downloaded page.
    while not dataQueue.empty():
        time.sleep(0.1)

    global PARSE_EXIT
    PARSE_EXIT = True

    for thread in threadparse:
        thread.join()
    print("2")

    with lock:
        # All parsers have been joined; safe to close the shared file.
        filename.close()
    print("谢谢使用!")

if __name__ == "__main__":
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: