您的位置:首页 > 编程语言 > Python开发

web测试常用python代码——爬虫程序

2012-04-26 14:02 573 查看
#coding=utf-8
#爬虫程序——起点
'''
Created on 2012-4-18

@author: xxx
'''

import urllib2
import Queue
import threading
import time
import socket
import sgmllib

# One book page URL per numeric id in [0, 1000000).
urls = ['http://www.qidian.com/Book/{0}.aspx/'.format(i) for i in range(1000000)]
# Number of worker threads the spider is started with.
threadsNum = 100
# Request a 512 KiB stack for threads created from here on.
threading.stack_size(512 * 1024)
# Default timeout, in seconds, applied to newly created sockets.
socket.setdefaulttimeout(10)

class BookSpider(sgmllib.SGMLParser):
    """Thread pool that downloads queued requests and hands back the bodies.

    Requests are submitted with push(), downloaded by daemon worker threads,
    and collected with pop() as ``(request, body)`` tuples.  Every pushed
    request produces exactly one result: when the download fails the body is
    the empty string, so a caller looping on ``while taskLeft(): pop()``
    can never block forever waiting for a result that will not arrive.
    """

    def __init__(self, threadsNum):
        """Create the queues and start ``threadsNum`` daemon worker threads."""
        sgmllib.SGMLParser.__init__(self)
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.lock = threading.Lock()
        # Requests waiting to be downloaded.
        self.requestQueue = Queue.Queue()
        # Finished downloads waiting to be collected with pop().
        self.completeQueue = Queue.Queue()
        # Number of workers currently processing a request.
        self.runThreadsNum = 0
        # Requests pushed but not yet popped; the single source of truth
        # for taskLeft(), updated only under self.lock.
        self._pendingNum = 0
        for _ in range(threadsNum):
            worker = threading.Thread(target=self.threadRun)
            worker.daemon = True
            worker.start()

    def __del__(self):
        """Wait until both queues have been fully processed."""
        time.sleep(2)
        # requestQueue drains once the workers have handled every request;
        # completeQueue drains once every result has been taken with pop().
        self.requestQueue.join()
        self.completeQueue.join()

    def taskLeft(self):
        """Return how many pushed requests have not been popped yet."""
        with self.lock:
            return self._pendingNum

    def push(self, request):
        """Queue ``request`` (anything the opener's open() accepts)."""
        # Count first so taskLeft() can never under-report a queued request.
        with self.lock:
            self._pendingNum += 1
        self.requestQueue.put(request)

    def pop(self):
        """Block until a result is available and return ``(request, body)``.

        ``body`` is the empty string when the download failed.
        """
        finished = self.completeQueue.get()
        # Without task_done() a join() on completeQueue would never return.
        self.completeQueue.task_done()
        with self.lock:
            self._pendingNum -= 1
        return finished

    def _fetch(self, request):
        """Download ``request`` and return the body, closing the response."""
        response = self.opener.open(request)
        try:
            return response.read()
        finally:
            response.close()

    def threadRun(self):
        """Worker loop: take a request, download it, publish the result."""
        while True:
            request = self.requestQueue.get()
            with self.lock:
                self.runThreadsNum += 1
            try:
                try:
                    result = self._fetch(request)
                    pause = 0.1
                except Exception:
                    # Report the failure as an empty body instead of dropping
                    # the request, and back off a little longer.
                    result = ''
                    pause = 0.2
                self.completeQueue.put((request, result))
            finally:
                # Bookkeeping must run even if something unexpected escapes.
                with self.lock:
                    self.runThreadsNum -= 1
                self.requestQueue.task_done()
            # Throttle between requests.
            time.sleep(pause)

class MyParser(sgmllib.SGMLParser):
    """SGML parser that gathers meta titles and the text after a hidden <b>.

    Attributes:
        text_meta: values of every ``title`` attribute seen on <meta> tags.
        text_br: stripped text chunks collected while ``is_b`` is 2.
        is_b: state flag -- 0 idle, 1 inside the marker <b>, 2 collecting.
    """

    def __init__(self):
        """Initialise the base parser and the empty result containers."""
        sgmllib.SGMLParser.__init__(self)
        self.is_b = 0
        self.text_br = []
        self.text_meta = []

    def start_meta(self, attrs):
        """Record the value of each ``title`` attribute of a <meta> tag."""
        self.text_meta.extend(
            value for name, value in attrs if name == 'title'
        )

    def start_b(self, attrs):
        """Enter state 1 when the <b> tag carries the marker style."""
        marker_found = any(
            name == 'style' and value == 'color:Red; display:none'
            for name, value in attrs
        )
        if marker_found:
            self.is_b = 1

    def end_b(self):
        """Move from state 1 to state 2 when the <b> element closes."""
        if self.is_b != 1:
            return
        self.is_b = 2

    def unknown_starttag(self, tag, attrs):
        """Leave state 2 on any unhandled start tag other than <br>."""
        if tag == 'br' or self.is_b != 2:
            return
        self.is_b = 0

    def handle_data(self, text):
        """Store stripped text, but only while in state 2."""
        if self.is_b != 2:
            return
        self.text_br.append(text.strip())

if __name__ == '__main__':
    # Script entry point: queue every URL, then parse each downloaded page
    # and append what the parser collected to qidian.txt.
    spider = BookSpider(threadsNum)
    for url in urls:
        spider.push(url)
    while spider.taskLeft():
        url, contents = spider.pop()
        myParser = MyParser()
        try:
            myParser.feed(contents)
            # feed() may hold back an incomplete trailing fragment; close()
            # forces the parser to process whatever is still buffered.
            myParser.close()
        except sgmllib.SGMLParseError:
            # One malformed page must not abort the rest of the crawl.
            continue
        # Append per page so finished results reach the file as the crawl
        # progresses; "with" closes the file even if a write fails.
        with open('qidian.txt', 'a') as writeFile:
            for i in myParser.text_meta:
                writeFile.write(url + '\n')
                writeFile.write(i + '\n')
            for i in myParser.text_br:
                writeFile.write(i + '\n')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: