基于协程的异步爬虫
2017-07-19 19:44
405 查看
基于tornado框架的异步爬虫小例子:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# created by fhqplzj on 2017/07/19 下午5:48
import logging
import time
from datetime import timedelta
from urlparse import urljoin, urldefrag
from scrapy import Selector
from tornado.gen import coroutine, Return
from tornado.httpclient import AsyncHTTPClient
from tornado.ioloop import IOLoop
from tornado.queues import Queue
logging.basicConfig()  # minimal root-logger setup so tornado warnings are visible
base_url = 'http://www.tornadoweb.org/en/stable/'  # crawl root; only URLs under this prefix are followed
concurrency = 10  # number of concurrent worker coroutines
@coroutine
def get_links_from_url(url):
try:
response = yield AsyncHTTPClient().fetch(url)
print 'fetched %s' % url
html = response.body if isinstance(response.body, str) else response.body.decode()
urls = [urljoin(url, urldefrag(new_url)[0]) for new_url in get_links(html)]
except Exception as e:
print 'Exception: %s %s' % (e, url)
raise Return([])
raise Return(urls)
def get_links(html):
    """Return the raw href value of every anchor tag found in *html*."""
    selector = Selector(text=html)
    return selector.xpath('//a/@href').extract()
@coroutine
def main():
q = Queue()
start = time.time()
# fetching: 已经抓的和正在抓的
# fetched: 已经抓的
fetching, fetched = set(), set()
@coroutine
def fetch_url():
current_url = yield q.get()
try:
if current_url in fetching:
return
print 'fetching %s' % current_url
fetching.add(current_url)
urls = yield get_links_from_url(current_url)
fetched.add(current_url)
for new_url in urls:
if new_url.startswith(base_url):
yield q.put(new_url)
finally:
q.task_done()
@coroutine
def worker():
while True:
yield fetch_url()
q.put(base_url)
for _ in range(concurrency):
worker()
yield q.join(timeout=timedelta(seconds=300))
assert fetching == fetched
print 'Done in %d seconds, fetched %s URLs.' % (time.time() - start, len(fetched))
if __name__ == '__main__':
    # Drive the main coroutine to completion on the current IOLoop.
    IOLoop.current().run_sync(main)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# created by fhqplzj on 2017/07/19 下午5:48
import logging
import time
from datetime import timedelta
from urlparse import urljoin, urldefrag
from scrapy import Selector
from tornado.gen import coroutine, Return
from tornado.httpclient import AsyncHTTPClient
from tornado.ioloop import IOLoop
from tornado.queues import Queue
logging.basicConfig()  # minimal root-logger setup so tornado warnings are visible
base_url = 'http://www.tornadoweb.org/en/stable/'  # crawl root; only URLs under this prefix are followed
concurrency = 10  # number of concurrent worker coroutines
@coroutine
def get_links_from_url(url):
try:
response = yield AsyncHTTPClient().fetch(url)
print 'fetched %s' % url
html = response.body if isinstance(response.body, str) else response.body.decode()
urls = [urljoin(url, urldefrag(new_url)[0]) for new_url in get_links(html)]
except Exception as e:
print 'Exception: %s %s' % (e, url)
raise Return([])
raise Return(urls)
def get_links(html):
    """Return the raw href value of every anchor tag found in *html*."""
    selector = Selector(text=html)
    return selector.xpath('//a/@href').extract()
@coroutine
def main():
q = Queue()
start = time.time()
# fetching: 已经抓的和正在抓的
# fetched: 已经抓的
fetching, fetched = set(), set()
@coroutine
def fetch_url():
current_url = yield q.get()
try:
if current_url in fetching:
return
print 'fetching %s' % current_url
fetching.add(current_url)
urls = yield get_links_from_url(current_url)
fetched.add(current_url)
for new_url in urls:
if new_url.startswith(base_url):
yield q.put(new_url)
finally:
q.task_done()
@coroutine
def worker():
while True:
yield fetch_url()
q.put(base_url)
for _ in range(concurrency):
worker()
yield q.join(timeout=timedelta(seconds=300))
assert fetching == fetched
print 'Done in %d seconds, fetched %s URLs.' % (time.time() - start, len(fetched))
if __name__ == '__main__':
    # Drive the main coroutine to completion on the current IOLoop.
    IOLoop.current().run_sync(main)
相关文章推荐
- Python实现基于协程的异步爬虫
- Python实现基于协程的异步爬虫
- Python实现基于协程的异步爬虫(一)
- Python实现基于协程的异步爬虫(一)
- Python实现基于协程的异步爬虫
- 基于asyncio 异步协程框架实现收集B站直播弹幕
- 基于asyncio 异步协程框架实现收集B站直播弹幕
- 第二章: 基于asyncio的异步爬虫迷你框架 Engine
- 第一章: 基于asyncio的异步爬虫迷你框架 Downloader
- 基于MFC的socket编程(异步非阻塞通信)
- [你必须知道的异步编程]——基于任务的异步模式
- 基于JavaScript、Javabean、Servlet、ajax的异步请求登录注册找回密码Javaweb项目
- 基于react16 webpack3 搭建前端spa基础框架 react-router的4种异步加载方式
- python爬虫日志(3)-爬取异步加载网页
- libco和tornado、协程和异步的一些理解
- Swift3.0基于原生异步回调HTTP+JSON
- c#中异步基于消息通信的完成端口的TCP/IP协议的组件实现(源代码) 客户端
- Boost.Asio C++ 网络编程之十:基于TCP的异步服务端
- 爬虫实战:基于 HtmlParser 实现网页链接的提取
- 一个基于python的数据爬虫