
A Little Weekend Crawler

2017-03-25
Nothing much to do while on duty today, so I'm taking the chance to learn some Python.

Target site: http://wooyun.tangscan.cn (I remember this site had no little ads the last time I looked; now it's covered in them)

import requests
import re
import threading

class Collector(object):
    """Holds the raw pages and the (url, title) pairs parsed out of them."""
    def __init__(self):
        self.pageindex = []
        self.pagecontent = []

class ContentParse(object):
    def __init__(self):
        # "totalPages: NNN" appears in the page's inline JavaScript
        self.pagerule = re.compile(r'totalPages:\s*(\d+)')
        # anchors whose href starts with "static" link to individual bug reports
        self.pageurlrule = re.compile(r'<a href="(?=static)([^"]+)"\starget="_blank">([\s\S]+?)</a>')
    def getpage(self, strings):
        return self.pagerule.findall(strings)
    def getpageurl(self, strings):
        return self.pageurlrule.findall(strings)

class Spider(threading.Thread):
    def __init__(self, prefix, page='1'):
        super(Spider, self).__init__()
        self.prefix = prefix
        self.page = page
    def run(self):
        strings = requests.get(self.prefix + self.page)
        return strings.content

def main():
    threads = []
    prefix = "http://wooyun.tangscan.cn/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page="
    spider = Spider(prefix)
    collector = Collector()
    contentparse = ContentParse()
    # fetch page 1 first to learn the total page count
    collector.pagecontent.append(spider.run())
    pages = contentparse.getpage(collector.pagecontent[0])
    for i in range(2, int(pages[0]) + 1):
        threads.append(Spider(prefix, str(i)))
    for thread in threads:
        # note: run() is called directly, so the downloads happen sequentially
        collector.pagecontent.append(thread.run())
    for strings in collector.pagecontent:
        collector.pageindex.extend(contentparse.getpageurl(strings))
    for index in collector.pageindex:
        print "url=%s,title=%s" % (index[0], index[1].strip())

main()
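One thing worth flagging in the code above: calling thread.run() just executes the method in the calling thread, so the "threads" above actually download one after another; start() is what spawns a real thread. A minimal sketch of the difference (worker is an illustrative function, not part of the crawler):

import threading
import time

def worker(n):
    time.sleep(1)
    print "done %d" % n

# run() is an ordinary method call: the three sleeps happen one after another
for i in range(3):
    threading.Thread(target=worker, args=(i,)).run()

# start() spawns real threads: the three sleeps overlap
threads = [threading.Thread(target=worker, args=(i,)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()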

That's all I've written for now; the functionality and performance still have a long way to go →_→:

(1) The Collector class should implement a collect method, instead of having the parsed content stuffed into it directly (see the sketch after this list)

(2) Page downloading (ideally filtering out the messy HTML tags) and logging are not implemented yet

(3) Key content is only extracted after the entire site has been crawled, and in actual testing this is quite slow; it would be much better to parse the URLs and titles, and write the logs, while downloading
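For point (1), a minimal sketch of what a Collector that owns a collect method might look like (the method name and signature are my own guess at the refactor, not code from this post):

class Collector(object):
    def __init__(self):
        self.pageindex = []
        self.pagecontent = []

    def collect(self, parser, content):
        # the collector accepts raw content plus a parser and fills its
        # own attributes, instead of callers writing into them directly
        self.pagecontent.append(content)
        self.pageindex.extend(parser.getpageurl(content))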

That's all I can think of for now; I'll finish this crawler tonight or tomorrow. Also, the code color scheme on OSChina is really ugly; does anyone know where to change it......

I've revised the code: it can now parse the target URLs and titles out of each page while downloading. Still missing: log output and downloading the target pages themselves.

import requests
import re
import threading
from Queue import Queue

class Collector(object):
    def __init__(self):
        self.url2title = {}
    def storage(self, name, strings):
        for i in strings:
            self.url2title[r"http://www.anquan.us/" + i[0]] = i[1].strip()

class ContentParse(threading.Thread):
    """Consumer: pulls downloaded pages off the queue and parses them."""
    def __init__(self, queue, collector):
        super(ContentParse, self).__init__()
        self.pageurlrule = re.compile(r'<a href="(?=static)([^"]+)"\starget="_blank">([\s\S]+?)</a>')
        self.queue = queue
        self.collector = collector
    def getpageurl(self, strings=None):
        return self.pageurlrule.findall(strings)
    def run(self):
        while True:
            data = self.queue.get()
            if data == 'final':   # sentinel: the producers are done
                break
            else:
                print "parsing..........."
                self.collector.storage('url2title', self.pageurlrule.findall(data))

class Spider(threading.Thread):
    """Producer: downloads one listing page and puts its text on the queue."""
    def __init__(self, prefix, queue, page='1'):
        super(Spider, self).__init__()
        self.prefix = prefix
        self.page = page
        self.queue = queue
    def run(self):
        content = requests.get(self.prefix + self.page)
        print "downloading.............."
        self.queue.put(content.text)

def getpage(strings):
    return re.findall(r'totalPages:\s*(\d+)', strings)

def main():
    prefix = r"http://wooyun.tangscan.cn/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page="
    queue = Queue()
    collector = Collector()
    # fetch page 1 synchronously to learn the total page count
    spider = Spider(prefix, queue)
    spider.run()
    pages = getpage(queue.get())
    contentparse = ContentParse(queue, collector)
    threads = []
    for i in range(2, int(pages[0]) + 1):
        threads.append(Spider(prefix, queue, str(i)))
    for thread in threads:
        thread.start()
    contentparse.start()
    for thread in threads:
        thread.join()
    queue.put('final')    # tell the parser thread to stop
    contentparse.join()   # wait for the parser to drain the queue

main()
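The 'final' string here is a sentinel value (a "poison pill"): join the producers first, then enqueue one sentinel per consumer so each consumer's loop can exit cleanly. A minimal sketch of the general pattern with several consumers (the names are illustrative):

from Queue import Queue
import threading

SENTINEL = 'final'

def consumer(q):
    while True:
        item = q.get()
        if item == SENTINEL:
            break
        # ... process item ...

q = Queue()
workers = [threading.Thread(target=consumer, args=(q,)) for _ in range(4)]
for w in workers:
    w.start()
# ... producers put real items here ...
for _ in workers:
    q.put(SENTINEL)   # one sentinel per consumer
for w in workers:
    w.join()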

Character-encoding problems are an absolute pain; I'll stop here for now. Time to learn a bit of Mongo, then keep writing.
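On the encoding pain: in requests, r.text is decoded using r.encoding, which comes from the HTTP headers and often falls back to ISO-8859-1 when the server declares no charset; r.apparent_encoding (detected from the body) is usually a better guess. A minimal sketch of how that could be normalized (the URL is just a placeholder):

import requests

r = requests.get("http://wooyun.tangscan.cn/")
# if the header-declared charset looks wrong, fall back to the detected one
if r.encoding is None or r.encoding.lower() == 'iso-8859-1':
    r.encoding = r.apparent_encoding
html = r.text                 # unicode, decoded with the chosen encoding
data = html.encode('utf-8')   # utf-8 bytes, safe to write to a file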

#-*- encoding:utf-8 -*-
import requests
import re
import threading
import logging
from Queue import Queue

class Collector(object):
    def __init__(self):
        self.url2title = {}
    def storage(self, name, strings):
        for i in strings:
            self.url2title[r"http://www.anquan.us/" + i[0]] = i[1].strip()

class ContentParse(threading.Thread):
    def __init__(self, queue, collector):
        super(ContentParse, self).__init__()
        self.pageurlrule = re.compile(r'<a href="(?=static)([^"]+)"\starget="_blank">([\s\S]+?)</a>')
        self.queue = queue
        self.collector = collector
    def getpageurl(self, strings=None):
        return self.pageurlrule.findall(strings)
    def run(self):
        while True:
            try:
                data, url = self.queue.get()
                if data == 'final':   # sentinel tuple ('final', None) ends the loop
                    break
                else:
                    writelog("parsing %s" % url)
                    self.collector.storage('url2title', self.pageurlrule.findall(data))
            except Exception, e:
                writelog(str(e))

class Spider(threading.Thread):
    def __init__(self, prefix, queue, page='1'):
        super(Spider, self).__init__()
        self.prefix = prefix
        self.page = page
        self.queue = queue
    def run(self):
        try:
            content = requests.get(self.prefix + self.page)
            writelog("download %s" % (self.prefix + self.page))
            # put inside the try block: if the request failed, content is undefined
            self.queue.put((content.text, self.prefix + self.page))
        except Exception, e:
            writelog(str(e))

def getpage(strings):
    return re.findall(r'totalPages:\s*(\d+)', strings)

# basicConfig only takes effect on its first call, so configure once up front
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%a %d %b %Y %H:%M:%S',
                    )

def writelog(message):
    logging.info(message)

def main():
    prefix = r"http://wooyun.tangscan.cn/search?keywords=app&&content_search_by=by_bugs&&search_by_html=False&&page="
    queue = Queue()
    collector = Collector()
    spider = Spider(prefix, queue)
    spider.run()
    pages = getpage(queue.get()[0])
    contentparse = ContentParse(queue, collector)
    threads = []
    for i in range(2, int(pages[0]) + 1):
        threads.append(Spider(prefix, queue, str(i)))
    for thread in threads:
        thread.start()
    contentparse.start()
    for thread in threads:
        thread.join()
    queue.put(('final', None))   # the parser unpacks tuples, so the sentinel is one too
    contentparse.join()          # make sure parsing is finished before downloading targets
    for key in collector.url2title.keys():
        try:
            r = requests.get(key)
            r.close()
            fp = open(collector.url2title[key].encode('gb2312') + '.html', 'w')
            # r.text is already unicode; encode it once to utf-8 bytes
            fp.write(r.text.encode('utf8'))
            fp.close()
        except Exception, e:
            writelog(str(e))

main()
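And since the plan is to learn Mongo next: a minimal sketch, assuming pymongo 3.x and a local mongod, of persisting the url2title dict instead of keeping it in memory (the database/collection names and the sample entry are placeholders of mine, not from the code above):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
coll = client['wooyun']['url2title']   # placeholder db/collection names

# e.g. what collector.url2title holds after a crawl (placeholder entry)
url2title = {"http://www.anquan.us/static/bugs/example": u"example title"}

# upsert one document per (url, title) pair, keyed by url
for url, title in url2title.items():
    coll.update_one({'url': url}, {'$set': {'title': title}}, upsert=True)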