Python Web Crawler
2012-05-03 11:11
1: splider.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# filename: splider.py
# author:   wfu (fuweilin@hotmail.com)
from spdUtility import PriorityQueue, Parser
import urllib2
import sys
import os

def updatePriQueue(priQueue, url):
    "Update the priority queue: bump the priority of a known url, or insert it."
    extraPrior = url.endswith('.html') and 2 or 0     # prefer urls that end in .html
    extraMyBlog = 'www.kgblog.net' in url and 5 or 0  # prefer pages from the chosen site (bid-for-ranking crawling??)
    item = priQueue.getitem(url)
    if item:
        newitem = (item[0] + 1 + extraPrior + extraMyBlog, item[1])
        priQueue.remove(item)
        priQueue.push(newitem)
    else:
        priQueue.push((1 + extraPrior + extraMyBlog, url))

def getmainurl(url):
    "Get the site root of a url, used as a prefix for relative urls."
    ix = url.find('/', len('http://'))
    if ix > 0:
        return url[:ix]
    else:
        return url

def analyseHtml(url, html, priQueue, downlist):
    "Parse the hyperlinks in an html page and update the priority queue."
    p = Parser()
    try:
        p.feed(html)
        p.close()
    except:
        return
    mainurl = getmainurl(url)
    for k, v in p.anchors.items():
        for u in v:
            if not u.startswith('http://'):  # turn relative urls into absolute ones
                u = mainurl + u
            if not downlist.count(u):        # skip urls that have already been downloaded
                updatePriQueue(priQueue, u)

def downloadUrl(id, url, priQueue, downlist, downFolder):
    "Download the given url, save it to disk and parse its hyperlinks."
    downFileName = downFolder + '/%d.html' % (id,)
    print 'downloading', url, 'as', downFileName,
    try:
        fp = urllib2.urlopen(url)
    except:
        print '[ failed ]'
        return False
    else:
        print '[ success ]'
        downlist.push(url)  # remember the url so it is not downloaded again
        op = open(downFileName, "wb")
        html = fp.read()
        html = unicode(html, "gb18030", "ignore").encode("utf8")  # re-encode as utf-8 before saving
        op.write(html)
        op.close()
        fp.close()
        analyseHtml(url, html, priQueue, downlist)
        return True

def spider(beginurl, pages, downFolder):
    "Crawler main loop: keep popping the highest-priority url from the queue and processing it."
    priQueue = PriorityQueue()
    downlist = PriorityQueue()  # set of downloaded urls, to prevent duplicate downloads
    priQueue.push((1, beginurl))
    i = 0
    while not priQueue.empty() and i < pages:
        k, url = priQueue.pop()
        if downloadUrl(i + 1, url, priQueue, downlist, downFolder):
            i += 1
    print '\nDownload', i, 'pages, Totally.'

def main():
    "Main function: set the parameters (seed url, number of pages to crawl, folder to save into)."
    beginurl = 'http://www.csdn.net'  # seed url to start crawling from
    pages = 10                        # number of pages to crawl
    downloadFolder = './down'         # folder where downloaded pages are saved
    if not os.path.isdir(downloadFolder):
        os.mkdir(downloadFolder)
    spider(beginurl, pages, downloadFolder)

if __name__ == '__main__':
    main()
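Running splider.py directly crawls 10 pages starting from http://www.csdn.net and saves them as 1.html, 2.html, ... under ./down. The spider() function can also be driven from other code; the following is only a usage sketch, where the seed url, the page count and the ./mydown folder are example values and both files are assumed to sit in the same directory:

# Usage sketch: crawl 5 pages starting from www.kgblog.net.
# The folder name './mydown' and the parameter values are examples only.
import os
from splider import spider

folder = './mydown'
if not os.path.isdir(folder):
    os.mkdir(folder)
spider('http://www.kgblog.net', 5, folder)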
2: spdUtility.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# filename: spdUtility.py
# author:   wfu (fuweilin@hotmail.com)
import bisect
import string
import htmllib
import formatter

class PriorityQueue(list):
    "Priority queue that stores a url together with its priority."
    def __init__(self):
        list.__init__(self)
        self.map = {}

    def push(self, item):
        # insert in sorted order and skip duplicates;
        # bisect.insort_left could be used to keep a strict ascending order
        if self.count(item) == 0:
            bisect.insort(self, item)
            self.map[item[1]] = item

    def pop(self):
        r = list.pop(self)
        del self.map[r[1]]
        return r

    def getitem(self, url):
        if self.map.has_key(url):
            return self.map[url]
        else:
            return None

    def empty(self):
        return len(self) == 0

    def remove(self, item):
        list.remove(self, item)
        del self.map[item[1]]

    def count(self, item):
        if len(self) == 0:
            return 0
        # binary search
        left = 0
        right = len(self) - 1
        mid = -1
        while left <= right:
            mid = (left + right) / 2
            if self[mid] < item:
                left = mid + 1
            elif self[mid] > item:
                right = mid - 1
            else:
                break
        return self[mid] == item and 1 or 0

class Parser(htmllib.HTMLParser):
    "HTML parsing class: collects the anchors of a page."
    def __init__(self, verbose=0):
        self.anchors = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)

    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href

    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]

def main():
    # just for test
    pq = PriorityQueue()
    # add items out of order
    pq.push((1, 'http://www.baidu.com'))
    pq.push((2, 'http://www.sina.com'))
    pq.push((3, 'http://www.google.com'))
    pq.push((1, 'http://www.163.com'))
    item = pq.getitem('http://www.sina.com')
    print item
    print pq.count(item)
    pq.remove(item)
    print pq.count(item)
    # print queue contents
    while not pq.empty():
        print pq.pop()

if __name__ == '__main__':
    main()
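The Parser class above is built on the htmllib and formatter modules, which are deprecated and no longer exist in Python 3. If you prefer to avoid them, roughly the same anchor collection can be written against HTMLParser instead. This is only a sketch, not part of the original code, and AnchorParser is a made-up name:

# Alternative parser sketch using HTMLParser instead of htmllib/formatter.
# Collects href values of <a> tags keyed by their link text, like Parser.anchors.
from HTMLParser import HTMLParser

class AnchorParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.anchors = {}
        self._href = None   # href of the <a> tag currently open, if any
        self._text = []     # text fragments seen inside the current <a> tag

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self._href = dict(attrs).get('href')
            self._text = []

    def handle_data(self, data):
        if self._href is not None:
            self._text.append(data)

    def handle_endtag(self, tag):
        if tag == 'a' and self._href:
            text = ''.join(self._text).strip()
            if text:
                self.anchors[text] = self.anchors.get(text, []) + [self._href]
            self._href = None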