python_慕课\Python开发简单爬虫\7-7 开始运行爬虫和爬取结果展.py
2017-12-04 11:46
597 查看
-- 7-3 URL管理器 https://www.imooc.com/video/10690
-- D:\project_py\py_001\baike_spider\url_manager.py
'''
Created on 2017年12月4日
@author: Administrator
'''
class UrlManager(object):
    """Bookkeeping for the crawl frontier.

    Keeps two disjoint pools: URLs waiting to be crawled and URLs
    already handed out, so each page is fetched at most once.
    """

    def __init__(self):
        # Pending URLs and already-dispatched URLs, respectively.
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue a single URL; None and already-seen URLs are ignored."""
        if url is None:
            return
        already_seen = (url in self.new_urls) or (url in self.old_urls)
        if not already_seen:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue a batch of URLs; a None or empty batch is a no-op."""
        if urls is None or len(urls) == 0:
            return
        for candidate in urls:
            self.add_new_url(candidate)

    def has_new_url(self):
        """Return True while at least one URL is still pending."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop an arbitrary pending URL, record it as seen, return it."""
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url
-- 7-4 HTML下载器 html_downloader https://www.imooc.com/video/10691
-- D:\project_py\py_001\baike_spider\html_downloader.py
import urllib2
class HtmlDownloader(object):
    """Fetches raw page content over HTTP via urllib2 (Python 2)."""

    def download(self, url):
        """Return the response body for *url* as a string.

        Returns None when *url* is None or the server does not answer
        with HTTP 200.  Network errors from urllib2.urlopen propagate
        to the caller unchanged.
        """
        if url is None:
            return None
        response = urllib2.urlopen(url)
        try:
            # Treat anything other than 200 as a failed download.
            if response.getcode() != 200:
                return None
            return response.read()
        finally:
            # The original leaked the connection on every call (and
            # especially on the non-200 early return); always close.
            response.close()
-- 7-5 HTML解析器html_parser https://www.imooc.com/video/10692
-- pycharm 中如何用快捷键自动import需要的库 https://segmentfault.com/q/1010000004340490 win: Alt + Enter
-- D:\project_py\py_001\baike_spider\html_parser.py
import urlparse
from bs4 import BeautifulSoup
import re
class HtmlParser(object):
    """Extracts follow-up links and lemma data from a Baike page."""

    def _get_new_urls(self, page_url, soup):
        """Collect absolute URLs of all in-page /view/NNN.htm links."""
        new_urls = set()
        # Link pattern: /view/123.htm
        # BUG FIX: the original used soup.find(), which returns a single
        # Tag; iterating it walks the tag's children, not the matched
        # links.  find_all() returns the list of matching <a> tags.
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # Resolve the relative href against the page it came from.
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Build a {url, title, summary} dict for the current page.

        NOTE(review): raises AttributeError if the title/summary nodes
        are absent — assumes the Baike page layout; confirm upstream.
        """
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        """Parse *html_cont*; return (new_urls, new_data) or None."""
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
</div>
-- 7-6 HTML输出器 https://www.imooc.com/video/10693
-- D:\project_py\py_001\baike_spider\html_outputer.py
class HtmlOutputer(object):
    """Buffers crawled records and renders them to output.html."""

    def __init__(self):
        # Each entry is a dict with 'url', 'title' and 'summary' keys.
        self.datas = []

    def collect_data(self, data):
        """Buffer one crawled record; None is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write all buffered records to output.html as a UTF-8 table.

        Fixes over the original: the file is opened as UTF-8 text (no
        per-field .encode hack), every <tr> is closed, the rows are
        wrapped in a <table>, and the file is closed even on error.
        """
        import codecs  # local import keeps the block self-contained
        fout = codecs.open('output.html', 'w', encoding='utf-8')
        try:
            fout.write('<html>')
            fout.write('<body>')
            fout.write('<table>')  # original omitted the table wrapper
            for data in self.datas:
                fout.write('<tr>')
                fout.write('<td>%s</td>' % data['url'])
                fout.write('<td>%s</td>' % data['title'])
                fout.write('<td>%s</td>' % data['summary'])
                fout.write('</tr>')  # original never closed the row
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')
        finally:
            # Close even when a malformed record raises mid-loop.
            fout.close()
-- 7-7 开始运行爬虫和爬取结果展示 https://www.imooc.com/video/10694
-- D:\project_py\py_001\baike_spider\url_manager.py
'''
Created on 2017年12月4日
@author: Administrator
'''
class UrlManager(object):
    """Bookkeeping for the crawl frontier.

    Keeps two disjoint pools: URLs waiting to be crawled and URLs
    already handed out, so each page is fetched at most once.
    """

    def __init__(self):
        # Pending URLs and already-dispatched URLs, respectively.
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue a single URL; None and already-seen URLs are ignored."""
        if url is None:
            return
        already_seen = (url in self.new_urls) or (url in self.old_urls)
        if not already_seen:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue a batch of URLs; a None or empty batch is a no-op."""
        if urls is None or len(urls) == 0:
            return
        for candidate in urls:
            self.add_new_url(candidate)

    def has_new_url(self):
        """Return True while at least one URL is still pending."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop an arbitrary pending URL, record it as seen, return it."""
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url
-- 7-4 HTML下载器 html_downloader https://www.imooc.com/video/10691
-- D:\project_py\py_001\baike_spider\html_downloader.py
import urllib2
class HtmlDownloader(object):
    """Fetches raw page content over HTTP via urllib2 (Python 2)."""

    def download(self, url):
        """Return the response body for *url* as a string.

        Returns None when *url* is None or the server does not answer
        with HTTP 200.  Network errors from urllib2.urlopen propagate
        to the caller unchanged.
        """
        if url is None:
            return None
        response = urllib2.urlopen(url)
        try:
            # Treat anything other than 200 as a failed download.
            if response.getcode() != 200:
                return None
            return response.read()
        finally:
            # The original leaked the connection on every call (and
            # especially on the non-200 early return); always close.
            response.close()
-- 7-5 HTML解析器html_parser https://www.imooc.com/video/10692
-- pycharm 中如何用快捷键自动import需要的库 https://segmentfault.com/q/1010000004340490 win: Alt + Enter
-- D:\project_py\py_001\baike_spider\html_parser.py
import urlparse
from bs4 import BeautifulSoup
import re
class HtmlParser(object):
    """Extracts follow-up links and lemma data from a Baike page."""

    def _get_new_urls(self, page_url, soup):
        """Collect absolute URLs of all in-page /view/NNN.htm links."""
        new_urls = set()
        # Link pattern: /view/123.htm
        # BUG FIX: the original used soup.find(), which returns a single
        # Tag; iterating it walks the tag's children, not the matched
        # links.  find_all() returns the list of matching <a> tags.
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # Resolve the relative href against the page it came from.
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Build a {url, title, summary} dict for the current page.

        NOTE(review): raises AttributeError if the title/summary nodes
        are absent — assumes the Baike page layout; confirm upstream.
        """
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        """Parse *html_cont*; return (new_urls, new_data) or None."""
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
</div>
-- 7-6 HTML输出器 https://www.imooc.com/video/10693
-- D:\project_py\py_001\baike_spider\html_outputer.py
class HtmlOutputer(object):
    """Buffers crawled records and renders them to output.html."""

    def __init__(self):
        # Each entry is a dict with 'url', 'title' and 'summary' keys.
        self.datas = []

    def collect_data(self, data):
        """Buffer one crawled record; None is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write all buffered records to output.html as a UTF-8 table.

        Fixes over the original: the file is opened as UTF-8 text (no
        per-field .encode hack), every <tr> is closed, the rows are
        wrapped in a <table>, and the file is closed even on error.
        """
        import codecs  # local import keeps the block self-contained
        fout = codecs.open('output.html', 'w', encoding='utf-8')
        try:
            fout.write('<html>')
            fout.write('<body>')
            fout.write('<table>')  # original omitted the table wrapper
            for data in self.datas:
                fout.write('<tr>')
                fout.write('<td>%s</td>' % data['url'])
                fout.write('<td>%s</td>' % data['title'])
                fout.write('<td>%s</td>' % data['summary'])
                fout.write('</tr>')  # original never closed the row
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')
        finally:
            # Close even when a malformed record raises mid-loop.
            fout.close()
-- 7-7 开始运行爬虫和爬取结果展示 https://www.imooc.com/video/10694
相关文章推荐
- python_慕课\Python开发简单爬虫\5-3 Python爬虫urlib2实例代码.py
- Python开发简单爬虫(二)---爬取百度百科页面数据
- 2---python开发简单爬虫
- ”Python开发简单爬虫“慕课网课程学习笔记1
- 如何开始写你的第一个python脚本——简单爬虫入门!
- Python开发简单爬虫 - 慕课网
- Python开发简单爬虫(二)
- Python开发简单爬虫
- ”Python开发简单爬虫“慕课网课程学习笔记1
- Python开发简单爬虫学习笔记(1)
- Python开发简单爬虫
- Python爬虫实战入门二:从一个简单的HTTP请求开始
- Python开发简单爬虫(一)
- python开发简单爬虫:实战篇
- Python开发简单爬虫
- Python开发简单爬虫(根据慕课网视频课程整理)
- Python开发简单爬虫
- python 开发简单爬虫
- 实践项目十:爬取百度百科Python词条相关1000个页面数据(慕课简单爬虫实战)
- 基础爬虫框架及运行(选自范传辉Python爬虫开发与项目实战)