python_慕课\Python开发简单爬虫\7-7 开始运行爬虫和爬取结果展.py

-- 7-3 URL Manager  https://www.imooc.com/video/10690

-- D:\project_py\py_001\baike_spider\url_manager.py

'''
Created on 2017-12-04

@author: Administrator
'''

class UrlManager(object):
    # Maintains the crawl frontier: URLs waiting to be crawled vs. already crawled

    def __init__(self):
        self.new_urls = set()   # URLs not yet crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        # add a single URL, skipping anything already seen in either set
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        # add a batch of URLs (e.g. the links the parser just extracted)
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # pop an uncrawled URL and move it to the crawled set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
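
A quick sanity check of the manager's de-duplication (an illustrative snippet, not from the course; the URL is just an example entry page):

manager = UrlManager()
manager.add_new_urls(['http://baike.baidu.com/view/21087.htm',
                      'http://baike.baidu.com/view/21087.htm'])   # duplicate is dropped
print manager.has_new_url()     # True
url = manager.get_new_url()     # pops the URL and marks it as crawled
manager.add_new_url(url)        # ignored: already in old_urls
print manager.has_new_url()     # False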

--  7-4 HTML Downloader (html_downloader)  https://www.imooc.com/video/10691

--  D:\project_py\py_001\baike_spider\html_downloader.py

import urllib2

class HtmlDownloader(object):
    # Fetches the raw HTML of a page (Python 2: urllib2)

    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        # anything other than HTTP 200 counts as a failed download
        if response.getcode() != 200:
            return None
        return response.read()
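
Note that urllib2 only exists in Python 2. If you follow the course under Python 3, a minimal equivalent (a sketch, not from the course) uses urllib.request instead:

import urllib.request

class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        # read() returns bytes in Python 3; Baidu Baike serves utf-8
        return response.read().decode('utf-8')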

-- 7-5 HTML Parser (html_parser)  https://www.imooc.com/video/10692

--  How to auto-import a needed library with a PyCharm shortcut: https://segmentfault.com/q/1010000004340490  (Windows: Alt + Enter)

-- D:\project_py\py_001\baike_spider\html_parser.py

import re
import urlparse   # Python 2 only; in Python 3 this lives in urllib.parse

from bs4 import BeautifulSoup

class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # entry links look like /view/123.htm
        # find_all (not find) is required here: find returns a single tag,
        # but we need every matching <a> element
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # hrefs are relative, so resolve them against the current page URL
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title">  <h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
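
A quick check of the parser against a hand-written page (illustrative only; the class names mirror the Baidu Baike markup targeted above):

html = '''
<html><body>
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a href="/view/123.htm">related entry</a>
</body></html>
'''
parser = HtmlParser()
new_urls, new_data = parser.parse('http://baike.baidu.com/view/21087.htm', html)
print new_urls                # set(['http://baike.baidu.com/view/123.htm'])
print new_data['title']       # Python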


--  7-6 HTML Outputer  https://www.imooc.com/video/10693

-- D:\project_py\py_001\baike_spider\html_outputer.py

class HtmlOutputer(object):
    # Collects parsed records and writes them out as an HTML table

    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        # Python 2 file objects write ascii by default,
        # so the Chinese title/summary must be encoded as utf-8
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
            fout.write('</tr>')   # close each row
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
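
Collecting one record by hand shows the expected shape of the data dict (illustrative):

outputer = HtmlOutputer()
outputer.collect_data({'url': 'http://baike.baidu.com/view/21087.htm',
                       'title': u'Python',
                       'summary': u'Python is a programming language.'})
outputer.output_html()   # writes output.html into the working directory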

-- 7-7 Running the crawler and viewing the crawl results  https://www.imooc.com/video/10694
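
The notes stop before the entry module that this lesson actually runs. Below is a minimal reconstruction of the scheduler, assuming it lives at baike_spider\spider_main.py alongside the four modules above; the craw loop and the 10-page cap follow the lesson's structure but are a sketch, not a verbatim copy. The root URL is the course's Python lemma page.

from baike_spider import url_manager, html_downloader, html_parser, html_outputer

class SpiderMain(object):

    def __init__(self):
        # wire the four components together
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 10:      # stop after a handful of pages for the demo
                    break
                count = count + 1
            except Exception:
                # a page that fails to download or parse should not kill the crawl
                print 'craw failed'
        self.outputer.output_html()

if __name__ == '__main__':
    root_url = 'http://baike.baidu.com/view/21087.htm'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)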