python 多线程采集网页完善版
2013-04-03 17:25
344 查看
import threading,time,random,htmllib,urllib,formatter,string,re
def getPageNum(data):
    """Extract the total page count from a listing page's HTML.

    Searches *data* for the pattern 'pageNum">共<digits>'.
    Returns the captured digits as a string, or the int 0 when the
    pattern is absent (callers normalize with int()/str(), so the
    mixed return type is preserved for compatibility).
    """
    result = re.findall(r'pageNum">共(\d+)', data)
    # Replaces the original bare except: the only expected failure is an
    # empty match list, so test for that explicitly.
    return result[0] if result else 0
def geturls(cururl):
    """Fetch *cururl*, harvest supplier links from it, and record results.

    Side effects:
      - appends "cururl@@@@pagecount" to datadown/listUrl.txt
      - appends each newly seen link to the shared global list ``l``
        and to datadown/historyUrl.txt
    """
    global l
    data = urllib.urlopen(cururl).read()
    parser = GetLinks()
    parser.feed(data)
    parser.close()
    # Keep only supplier links that are not pagination of the current URL.
    # NOTE: the original used item.index('/suppliers/') as the condition,
    # which is falsy (0) when the substring starts at offset 0 and thus
    # silently dropped those links; 'in' has no such pitfall.
    urls = []
    for item in parser.links:
        if '/suppliers/' in item and not item.startswith(cururl):
            urls.append(item)
    cursize = getPageNum(data)
    # The site caps listings at 100 pages.
    if int(cursize) > 100:
        cursize = 100
    output = open('datadown/listUrl.txt', 'a+')
    try:
        output.write(cururl + "@@@@" + str(cursize) + "\n")
    finally:
        # Always release the handle, even if the write fails.
        output.close()
    # De-duplicate against the shared work list; plain membership test
    # replaces the original list.index()/except ValueError idiom.
    for ni in urls:
        if ni not in l:
            l.append(ni)
            output = open('datadown/historyUrl.txt', 'a+')
            try:
                output.write(ni + "\n")
            finally:
                output.close()
class GetLinks(htmllib.HTMLParser):
    """HTML parser that collects the href of every anchor with link text."""

    def __init__(self):
        self.links = []   # hrefs of anchors whose visible text is non-empty
        self.link = None  # href of the anchor currently open, if any
        # NullFormatter discards all rendered output; we only want events.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

    def anchor_bgn(self, href, name, type):
        # Start buffering the anchor's text and remember its target.
        # Fix: self.link is now pre-initialized in __init__, so anchor_end
        # cannot raise AttributeError on malformed HTML (</a> without <a>).
        self.save_bgn()
        self.link = href

    def anchor_end(self):
        # string.strip(s) is deprecated; the str method is equivalent.
        text = self.save_end().strip()
        if self.link and text:
            self.links.append(self.link)
# Shared state for the crawler worker threads.
mylock = threading.RLock()  # guards updates to num and l in myThread.run
num=0  # count of URLs claimed by threads so far; also indexes into l
l=[]   # global work list of discovered listing URLs (lock-protected)
class myThread(threading.Thread):
    # Worker thread: repeatedly claims the next URL from the shared list l
    # and crawls it, until the shared counter catches up with the list.
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name  # label printed in progress output
    def run(self):
        global num,l
        while True:
            # --- synchronized section begins ---
            mylock.acquire()
            num+=1
            if len(l)==0:
                # First thread in: seed the work list from the start page.
                # The URL path is GBK percent-encoded (presumably a city
                # name — TODO confirm against the site).
                curl="http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/"
                print '当前线程:%s,当前连接:%s,当前数量:%d\n'%(self.t_name, curl, num)
                geturls(curl)
            # num doubles as a 1-based cursor into the shared list l.
            # NOTE(review): if the seed page yields fewer links than num,
            # this index can go out of range — verify under load.
            curl=l[num-1]
            print '当前线程:%s,当前连接:%s,当前数量:%d\n'%(self.t_name, curl, num+1)
            mylock.release()
            # --- synchronized section ends ---
            geturls(curl)  # the slow network fetch runs outside the lock
            if num>=len(l):
                # Work list exhausted: no URLs left beyond the cursor.
                #mylock.release()
                #print 'len(l):'+ str(len(self.web.l))
                print "\n\nl:\n"
                #for i in l:
                #print i
                break
def test():
    """Launch five crawler worker threads, named A1 through A5."""
    idx = 1
    while idx <= 5:
        worker = myThread('A%d' % idx)
        worker.start()
        idx += 1
if __name__== '__main__':
    # Script entry point: start the multi-threaded crawl.
    test()
def getPageNum(data):
    """Extract the total page count from a listing page's HTML.

    Searches *data* for the pattern 'pageNum">共<digits>'.
    Returns the captured digits as a string, or the int 0 when the
    pattern is absent (callers normalize with int()/str(), so the
    mixed return type is preserved for compatibility).
    """
    result = re.findall(r'pageNum">共(\d+)', data)
    # Replaces the original bare except: the only expected failure is an
    # empty match list, so test for that explicitly.
    return result[0] if result else 0
def geturls(cururl):
    """Fetch *cururl*, harvest supplier links from it, and record results.

    Side effects:
      - appends "cururl@@@@pagecount" to datadown/listUrl.txt
      - appends each newly seen link to the shared global list ``l``
        and to datadown/historyUrl.txt
    """
    global l
    data = urllib.urlopen(cururl).read()
    parser = GetLinks()
    parser.feed(data)
    parser.close()
    # Keep only supplier links that are not pagination of the current URL.
    # NOTE: the original used item.index('/suppliers/') as the condition,
    # which is falsy (0) when the substring starts at offset 0 and thus
    # silently dropped those links; 'in' has no such pitfall.
    urls = []
    for item in parser.links:
        if '/suppliers/' in item and not item.startswith(cururl):
            urls.append(item)
    cursize = getPageNum(data)
    # The site caps listings at 100 pages.
    if int(cursize) > 100:
        cursize = 100
    output = open('datadown/listUrl.txt', 'a+')
    try:
        output.write(cururl + "@@@@" + str(cursize) + "\n")
    finally:
        # Always release the handle, even if the write fails.
        output.close()
    # De-duplicate against the shared work list; plain membership test
    # replaces the original list.index()/except ValueError idiom.
    for ni in urls:
        if ni not in l:
            l.append(ni)
            output = open('datadown/historyUrl.txt', 'a+')
            try:
                output.write(ni + "\n")
            finally:
                output.close()
class GetLinks(htmllib.HTMLParser):
    """HTML parser that collects the href of every anchor with link text."""

    def __init__(self):
        self.links = []   # hrefs of anchors whose visible text is non-empty
        self.link = None  # href of the anchor currently open, if any
        # NullFormatter discards all rendered output; we only want events.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

    def anchor_bgn(self, href, name, type):
        # Start buffering the anchor's text and remember its target.
        # Fix: self.link is now pre-initialized in __init__, so anchor_end
        # cannot raise AttributeError on malformed HTML (</a> without <a>).
        self.save_bgn()
        self.link = href

    def anchor_end(self):
        # string.strip(s) is deprecated; the str method is equivalent.
        text = self.save_end().strip()
        if self.link and text:
            self.links.append(self.link)
# Shared state for the crawler worker threads.
mylock = threading.RLock()  # guards updates to num and l in myThread.run
num=0  # count of URLs claimed by threads so far; also indexes into l
l=[]   # global work list of discovered listing URLs (lock-protected)
class myThread(threading.Thread):
    # Worker thread: repeatedly claims the next URL from the shared list l
    # and crawls it, until the shared counter catches up with the list.
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name  # label printed in progress output
    def run(self):
        global num,l
        while True:
            # --- synchronized section begins ---
            mylock.acquire()
            num+=1
            if len(l)==0:
                # First thread in: seed the work list from the start page.
                # The URL path is GBK percent-encoded (presumably a city
                # name — TODO confirm against the site).
                curl="http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/"
                print '当前线程:%s,当前连接:%s,当前数量:%d\n'%(self.t_name, curl, num)
                geturls(curl)
            # num doubles as a 1-based cursor into the shared list l.
            # NOTE(review): if the seed page yields fewer links than num,
            # this index can go out of range — verify under load.
            curl=l[num-1]
            print '当前线程:%s,当前连接:%s,当前数量:%d\n'%(self.t_name, curl, num+1)
            mylock.release()
            # --- synchronized section ends ---
            geturls(curl)  # the slow network fetch runs outside the lock
            if num>=len(l):
                # Work list exhausted: no URLs left beyond the cursor.
                #mylock.release()
                #print 'len(l):'+ str(len(self.web.l))
                print "\n\nl:\n"
                #for i in l:
                #print i
                break
def test():
    """Launch five crawler worker threads, named A1 through A5."""
    idx = 1
    while idx <= 5:
        worker = myThread('A%d' % idx)
        worker.start()
        idx += 1
if __name__== '__main__':
    # Script entry point: start the multi-threaded crawl.
    test()
相关文章推荐
- python 多线程采集网页
- 用python写的多线程网页爬虫
- python beautifulsoup多线程分析抓取网页
- API例子:用Python驱动Firefox采集网页数据
- Python采集网页时正则表达式匹配换行符的问题
- python 多线程处理抓取网页
- Python 多线程抓取网页
- python实现多线程采集的2个代码例子
- C#多线程使用webbrowser实现采集动态网页的爬虫机器人
- php多线程采集网页数据-php采集网页-php爬虫视频教程8
- Python 多线程抓取网页 牛人 use raw socket implement http request great
- 发一个python写的多线程 代理服务器 抓取,保存,验证程序,希望喜欢python的朋友和我一起完善它
- API例子:用Python驱动Firefox采集网页数据
- Python网页信息采集:使用PhantomJS采集淘宝天猫商品内容
- Python3 多线程数据采集中的一些坑
- API例子:用Python驱动Firefox采集网页数据
- 使用Python的BeautifulSoup 类库采集网页内容
- Python网页信息采集:使用PhantomJS采集淘宝天猫商品内容
- python多线程实现抓取网页
- 通过python多线程访问网页