您的位置:首页 > 编程语言 > Python开发

python 多线程采集网页完善版

2013-04-03 17:25 344 查看
import threading,time,random,htmllib,urllib,formatter,string,re

def getPageNum(data):
    """Return the page count advertised by the pager markup in ``data``.

    The count is the run of digits that follows the first occurrence of
    ``pageNum">共`` in the page source.

    Args:
        data: HTML source of a listing page.

    Returns:
        The digits as a string (e.g. ``'12'``) when the pager is present,
        otherwise the integer ``0``.  Callers convert with ``int()``/``str()``,
        so both forms are accepted downstream.
    """
    # Only the first hit is ever used, so search() is enough; an explicit
    # "no match" test replaces the former bare ``except``.
    match = re.search(r'pageNum">共(\d+)', data)
    if match is None:
        return 0
    return match.group(1)

def geturls(cururl):
    """Fetch ``cururl``, record its page count and queue the links it contains.

    Side effects:
      * appends ``<cururl>@@@@<page count>`` to ``datadown/listUrl.txt``; the
        count is capped at 100 and is 0 when the page has no pager;
      * every qualifying link that is not yet in the module-level list ``l``
        is appended to ``l`` and to ``datadown/historyUrl.txt``.

    A link qualifies when it contains ``/suppliers/`` somewhere after its
    first character and does not start with ``cururl`` itself.

    Exceptions raised by ``urllib.urlopen`` or ``open`` (unreachable page,
    missing ``datadown`` directory, ...) propagate to the caller.
    """
    response = urllib.urlopen(cururl)
    try:
        data = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    linkdemo = GetLinks()
    linkdemo.feed(data)
    linkdemo.close()

    # ``find() > 0`` keeps the original filter exactly: a link without the
    # marker (-1) is dropped, and so is one where the marker sits at index 0.
    # NOTE(review): links *beginning* with '/suppliers/' were skipped by the
    # original truthiness test on index() as well; confirm this is intended.
    urls = [item for item in linkdemo.links
            if item.find('/suppliers/') > 0 and not item.startswith(cururl)]

    cursize = getPageNum(data)
    if int(cursize) > 100:
        cursize = 100

    # Worker threads call this function concurrently.  The membership test
    # followed by append on ``l`` is not atomic, so all bookkeeping is done
    # under the module's re-entrant lock (safe when the caller already
    # holds it).
    with mylock:
        with open('datadown/listUrl.txt', 'a+') as output:
            output.write(cururl + "@@@@" + str(cursize) + "\n")

        fresh = []
        for ni in urls:
            if ni not in l:
                l.append(ni)
                fresh.append(ni)

        # Open the history file once per page, and only if there is
        # something new to record.
        if fresh:
            with open('datadown/historyUrl.txt', 'a+') as output:
                output.writelines(ni + "\n" for ni in fresh)

class GetLinks(htmllib.HTMLParser):

def __init__(self):

self.links = []

f = formatter.NullFormatter()

htmllib.HTMLParser.__init__(self, f)

def anchor_bgn(self, href, name, type):

self.save_bgn()

self.link = href

def anchor_end(self):

text = string.strip(self.save_end())

if self.link and text:

self.links.append(self.link)

# State shared by all worker threads.

# Re-entrant lock guarding ``num`` and the hand-out of URLs from ``l``.
mylock = threading.RLock()

# Number of URLs handed out to workers so far (incremented in myThread.run).
num = 0

# Every URL discovered so far, in discovery order; workers index into it
# with ``num`` to pick their next URL.
l = []

class myThread(threading.Thread):
    """Worker thread that repeatedly takes the next queued URL and crawls it.

    Work is shared through module-level state: ``l`` (URLs discovered so
    far), ``num`` (how many of them have been handed out) and ``mylock``
    (guards the hand-out).  ``geturls`` does the fetching and appends newly
    found URLs to ``l``.
    """

    def __init__(self, name):
        threading.Thread.__init__(self)
        # Label used only in the progress messages.
        self.t_name = name

    def run(self):
        """Crawl until no un-crawled URL is left in ``l``.

        A worker stops as soon as it finds the queue exhausted, even if other
        workers are still fetching pages that may add more URLs (this matches
        the original exit condition).
        """
        global num
        while True:
            # --- begin synchronized section ---
            # ``with`` guarantees the lock is released if anything below
            # raises; a lock left held would block every other worker.
            with mylock:
                if not l:
                    # Nothing discovered yet: crawl the seed page first.
                    curl = "http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/"
                    print('当前线程:%s,当前连接:%s,当前数量:%d\n' % (self.t_name, curl, num + 1))
                    geturls(curl)
                if num >= len(l):
                    # Every queued URL has already been handed out (or the
                    # seed page produced none); indexing would raise
                    # IndexError, so stop this worker instead.
                    print("\n\nl:\n")
                    break
                num += 1
                curl = l[num - 1]
                print('当前线程:%s,当前连接:%s,当前数量:%d\n' % (self.t_name, curl, num + 1))
            # --- end synchronized section ---

            # Fetch outside the lock so workers download in parallel.
            geturls(curl)
            if num >= len(l):
                print("\n\nl:\n")
                break

def test(thread_count=5):
    """Start the crawl with ``thread_count`` worker threads.

    Workers are named ``A1`` .. ``A<thread_count>``.  The function returns
    immediately after starting them; it does not wait for them to finish.

    Args:
        thread_count: number of worker threads to start (default 5, the
            value that used to be hard-coded).
    """
    for i in range(1, thread_count + 1):
        threadi = myThread('A' + str(i))
        threadi.start()

if __name__ == '__main__':
    # Start crawling only when run as a script, not when imported.
    test()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: