web测试常用python代码——爬虫程序
2012-04-26 14:02
573 查看
#coding=utf-8
"""Multi-threaded crawler for qidian.com book pages (Python 2).

Pushes a range of candidate book URLs into a thread pool, downloads each
page, extracts the <meta title=...> value and the hidden red-<b> text
runs, and appends them to qidian.txt.

Created on 2012-4-18
@author: xxx
"""
import urllib2
import Queue
import threading
import time
import socket
import sgmllib

# Candidate book-page URLs to try (most IDs will 404 and be skipped).
urls = ['http://www.qidian.com/Book/%d.aspx/' % i for i in range(0, 1000000)]
threadsNum = 100

# Small per-thread stack so 100 worker threads stay cheap.
threading.stack_size(32768 * 16)
# Abort hung HTTP connections after 10 seconds.
socket.setdefaulttimeout(10)


class BookSpider(sgmllib.SGMLParser):
    """Thread-pooled downloader: push() URLs in, pop() (url, html) out."""

    def __init__(self, threadsNum):
        # FIX: the original never initialised the SGMLParser base class.
        sgmllib.SGMLParser.__init__(self)
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.lock = threading.Lock()        # guards runThreadsNum
        self.requestQueue = Queue.Queue()   # URLs waiting to be fetched
        self.completeQueue = Queue.Queue()  # (url, html) fetched results
        self.runThreadsNum = 0              # fetches currently in flight
        for _ in range(threadsNum):
            worker = threading.Thread(target=self.threadRun)
            worker.daemon = True
            worker.start()

    def __del__(self):
        # Best-effort drain before the object disappears.
        # NOTE(review): join() in __del__ is fragile — it can block or be
        # skipped entirely at interpreter shutdown; kept for compatibility.
        time.sleep(2)
        self.requestQueue.join()
        self.completeQueue.join()

    def taskLeft(self):
        """Return the number of URLs not yet fully consumed:
        queued + in flight + downloaded-but-not-popped."""
        return (self.requestQueue.qsize()
                + self.completeQueue.qsize()
                + self.runThreadsNum)

    def push(self, request):
        """Queue a URL for download."""
        self.requestQueue.put(request)

    def pop(self):
        """Block until a (url, html) pair is ready and return it."""
        return self.completeQueue.get()

    def threadRun(self):
        """Worker loop: fetch queued URLs forever.

        Failed fetches (timeouts, 404s) are deliberately dropped — this is
        a best-effort scan over mostly-nonexistent book IDs.
        """
        while True:
            request = self.requestQueue.get()
            with self.lock:
                self.runThreadsNum += 1
            try:
                result = self.opener.open(request).read()
                self.completeQueue.put((request, result))
            except Exception:
                # Skip unreachable pages after a short pause.
                time.sleep(0.1)
            finally:
                # FIX: decrement the in-flight counter and mark the task
                # done even if an unexpected error escapes above, so
                # taskLeft() can never get stuck counting a dead fetch.
                with self.lock:
                    self.runThreadsNum -= 1
                self.requestQueue.task_done()
            time.sleep(0.1)


class MyParser(sgmllib.SGMLParser):
    """Extracts <meta title=...> values and the text that follows a hidden
    red <b style="color:Red; display:none"></b> marker (broken only by
    <br> tags) — the trick qidian used to obfuscate chapter text."""

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.text_meta = []  # values of <meta title="..."> attributes
        self.text_br = []    # text runs following the hidden-red marker
        # State machine: 0 = outside; 1 = inside the marker <b>;
        # 2 = past its </b>, collecting text until a non-<br> tag.
        self.is_b = 0

    def start_meta(self, attrs):
        for name, value in attrs:
            if name == 'title':
                self.text_meta.append(value)

    def start_b(self, attrs):
        for name, value in attrs:
            if name == 'style' and value == 'color:Red; display:none':
                self.is_b = 1

    def end_b(self):
        if self.is_b == 1:
            self.is_b = 2

    def unknown_starttag(self, tag, attrs):
        # Any tag other than <br> ends the hidden-text run.
        if self.is_b == 2 and tag != 'br':
            self.is_b = 0

    def handle_data(self, text):
        if self.is_b == 2:
            self.text_br.append(text.strip())


if __name__ == '__main__':
    spider = BookSpider(threadsNum)
    for url in urls:
        spider.push(url)
    # FIX: open the output file once instead of re-opening it for every
    # crawled page (same append semantics, far fewer syscalls), and make
    # sure it is closed even if parsing raises.
    writeFile = open('qidian.txt', 'a')
    try:
        while spider.taskLeft():
            url, contents = spider.pop()
            myParser = MyParser()
            myParser.feed(contents)
            for title in myParser.text_meta:
                writeFile.write(url + '\n')
                writeFile.write(title + '\n')
            for line in myParser.text_br:
                writeFile.write(line + '\n')
    finally:
        writeFile.close()
相关文章推荐
- web测试常用python代码——发包
- python-常用小程序-网页爬虫
- web测试常用python代码——ssh远程登陆以及命令执行
- web测试常用python代码——mysql连接以及语句执行
- 【python】100行代码python爬虫程序,抓取网站图片存储本地(附:中文链接解决)
- web测试常用python代码——mysql连接以及语句执行
- web测试常用python代码——ssh远程登陆以及命令执行
- web测试常用python代码——引入mysqldb及配置
- ASP.NET程序中常用代码汇总-2[转]
- ASP.NET程序中常用的三十三种代码
- ASP.NET程序中常用的三十三种代码
- python常用代码和函数实例
- Python之一些常用的爬虫技巧总结
- Dedecms程序SEO常用的列表标签调用代码集合 dedecms优化
- NET程序中常用的代码
- ASP.NET程序中常用的三十三种代码
- python3.4学习笔记(十四) 网络爬虫实例代码,抓取新浪爱彩双色球开奖数据实例
- 微信小程序开发常用技巧(9)——使用模板文件方便公共代码开发
- python爬虫抓手机号+java客户端小程序2