
python_imooc \ Developing a Simple Crawler with Python \ 5-3 Python crawler urllib2 example code.py

2017-12-04 11:45
-- 3-1 Python simple crawler architecture                https://www.imooc.com/video/10677
-- 3-2 Run flow of the simple crawler architecture       https://www.imooc.com/video/10678

-- 4-1 Python crawler URL management                     https://www.imooc.com/video/10679
-- 4-2 Python crawler URL manager implementations        https://www.imooc.com/video/10680
-- 5-2 Python crawler urllib2 downloader: page download  https://www.imooc.com/video/10682

-- 1 

# coding: utf-8
# urllib2 download method 1: the simplest

import urllib2

# Make the request directly
response = urllib2.urlopen('http://www.baidu.com')

# Get the status code; 200 means the fetch succeeded
print response.getcode()

# Read the content
cont = response.read()
print cont
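
Note: urllib2 exists only on Python 2. A minimal Python 3 equivalent of the snippet above, using urllib.request (assuming the page decodes as UTF-8):

# coding: utf-8
# Python 3 version of method 1; urllib2 was split into urllib.request / urllib.error
from urllib.request import urlopen

response = urlopen('http://www.baidu.com')
print(response.getcode())                 # 200 means the fetch succeeded
print(response.read().decode('utf-8'))    # read() returns bytes on Python 3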

-- 2

# coding: utf-8
# urllib2 download method 2: add data and an HTTP header

import urllib
import urllib2

# Create a Request object
request = urllib2.Request('http://www.baidu.com')

# Add data; add_data() takes a single urlencoded string
# (and turns the request into a POST)
request.add_data(urllib.urlencode({'a': '1'}))

# Add an HTTP header
request.add_header('User-Agent', 'Mozilla/5.0')

# Send the request and get the result
response = urllib2.urlopen(request)
print response

-- 3

# coding: utf-8
# urllib2 download method 3: add handlers for special scenarios

import urllib2, cookielib

# Create a cookie container
cj = cookielib.CookieJar()

# Create an opener that handles cookies
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

# Install the opener on urllib2
urllib2.install_opener(opener)

# Fetch the page with the cookie-aware urllib2
response = urllib2.urlopen('http://www.baidu.com')
print response

-- 5-3 Python crawler urllib2 example code  https://www.imooc.com/video/10683

# coding: utf-8

import urllib2, cookielib

url = 'http://www.baidu.com'

# 1: plain urlopen
response1 = urllib2.urlopen(url)
print response1.getcode()
print len(response1.read())

# 2: Request object with an added header
request = urllib2.Request(url)
request.add_header('User-Agent', 'Mozilla/5.0')
response2 = urllib2.urlopen(request)
print response2.getcode()
print len(response2.read())

# 3: opener with cookie handling
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
response3 = urllib2.urlopen(url)
print response3.getcode()
print len(response3.read())
print cj

-- 6-1 Introduction to Python crawler page parsers  https://www.imooc.com/video/10684

-- 6-2 BeautifulSoup module: introduction and installation  https://www.imooc.com/video/10685

Beautiful Soup: a third-party Python library for extracting data from HTML or XML.
Official site: http://www.crummy.com/software/BeautifulSoup/
Install: pip install beautifulsoup4
Verify: if import bs4 raises no error, the installation succeeded.
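
A quick check, assuming beautifulsoup4 is installed (the bs4 package exposes __version__):

# coding: utf-8
import bs4                # an ImportError here means the install failed
print bs4.__version__     # e.g. 4.6.0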

 

-- 6-3 BeautifulSoup syntax  https://www.imooc.com/video/10686
Search for nodes with find_all / find, then access each match's name, attributes and text:

node.name       : tag name of the matched node
node['href']    : href attribute of a matched <a> node
node.get_text() : link text of a matched <a> node
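
A minimal sketch of these accessors on a one-line document (illustrative only; the full example follows in 6-4):

# coding: utf-8
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="/" class="hide-text">imooc</a>', 'html.parser')
link = soup.find('a', class_='hide-text')   # search by tag name and attribute
print link.name        # -> a
print link['href']     # -> /
print link.get_text()  # -> imooc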

-- 6-4 BeautifulSoup example test  https://www.imooc.com/video/10687
# coding: utf-8

import urllib2, cookielib, re
from bs4 import BeautifulSoup

# Copied from http://www.imooc.com/search/course?words=beatiful
html_doc = """
    <html>
<body >
<div id="header">
    <div class="page-container" id="nav"  >
         <div id="logo" class="logo"><a href="/" target="_self" class="hide-text" title="首页">慕课网</a></div>
        <button type="button" class="navbar-toggle visible-xs-block js-show-menu" >
            <i class="icon-menu"></i>
        </button>
        <div id="login-area">
            <ul class="clearfix logined">
                <li class="app-load" id="js-app-load">
                    <a href="//www.imooc.com/mobile/app" target="_blank" style="width:60px;">下载APP</a>
                    <div class="app-load-box clearfix js-load-box">
                        <img src="/static/img/common/appload.png" class="l">
                        <div class="r">
                            <p>扫描下载慕课网APP</p>
                            <a href="https://itunes.apple.com/cn/app/mu-ke-wang/id722179140?mt=8"><i class="icon-apple"></i>App Store下载</a>
                            <a href="//www.imooc.com/mobile/mukewang.apk"><i class="icon-android"></i>Android下载</a>
                        </div>
                    </div>
                </li>
                <li class="shop-cart" id="shop-cart">
                    <a href="http://order.imooc.com/pay/cart" class="shop-cart-icon" target="_blank">
                        <span class="icon-shopping-cart js-endcart"></span><span>购物车</span><span class="shopping_icon js-cart-num" data-ordernum="0"  data-cartnum="0" style='display: none'>0</span>
                    </a>
                    <div class="my-cart" id="js-my-cart"></div>
                </li>
                <li class='remind_warp'>
                    <i class="msg_remind"></i>
                    <a target="_blank" href='/u/2460583/notices'>
                        <i class='icon-notifi'></i>
                        <!-- <span class="msg_icon" style="display: none;"></span> -->
                    </a>
                </li>
                <li class="set_btn user-card-box" id='header-user-card'>
                    <a id="header-avator" class="user-card-item js-header-avator" action-type="my_menu"  href="/u/2460583" target="_self">
                        <img width="40" height="40">
                        <i class="myspace_remind" style="display: none;"></i>
                        <span style="display: none;">动态提醒</span>
                    </a>
                    <div class="g-user-card"></div>
                </li>
            </ul>
    </div>
</div>
</body>
</html>
"""

# 1: find all links
soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')

print 'All links:'
links = soup.find_all('a')
for link in links:
    print link.name, link['href'], link.get_text()

# 2: find the link whose href is exactly http://order.imooc.com/pay/cart
link_node = soup.find('a', href='http://order.imooc.com/pay/cart')
print link_node.name, link_node['href'], link_node.get_text()

# 3: match the href with a regular expression
link_node = soup.find('a', href=re.compile(r'cart'))
print link_node.name, link_node['href'], link_node.get_text()

-- 7-1 Python crawler example: analyze the target  https://www.imooc.com/video/10688

-- 7-2 The scheduler  https://www.imooc.com/video/10689

-- D:\project_py\py_001\baike_spider\spider_main.py

'''
Created on 2017-12-04

@author: Administrator
'''
from baike_spider import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d: %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:   # stop after 1000 pages
                    break
                count = count + 1
            except Exception:
                print 'craw failed'
        self.outputer.output_html()


if __name__ == "__main__":   # must come after the class definition
    root_url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

-- D:\project_py\py_001\baike_spider\html_downloader.py

class HtmlDownloader(object):
    def download(self, url):   # stub; must accept the url passed by SpiderMain.craw
        pass
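
The class above is still a stub. A minimal working version in the spirit of the urllib2 examples earlier in these notes (a sketch, not necessarily the course's final code):

import urllib2

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None               # anything but 200 counts as a failed fetch
        return response.read()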

--  D:\project_py\py_001\baike_spider\html_outputer.py

class HtmlOutputer(object):
    def collect_data(self, data):   # stub; must accept the data passed by SpiderMain.craw
        pass

    def output_html(self):
        pass
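
A possible fleshed-out version, assuming the parser hands over dicts with url, title and summary keys (hypothetical field names matching the parser sketch below) and that the report is a plain HTML table:

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []   # collected page dicts

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write('<html><body><table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
            fout.write('</tr>')
        fout.write('</table></body></html>')
        fout.close()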

-- D:\project_py\py_001\baike_spider\html_parser.py

class HtmlParser(object):
    def parse(self, page_url, html_cont):   # stub; must match the call in SpiderMain.craw
        pass
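
A sketch of a BeautifulSoup-based parser, assuming Baidu Baike article links contain /item/ (the selector and the returned fields are illustrative assumptions, not the course's exact code):

# coding: utf-8
import re
import urlparse
from bs4 import BeautifulSoup

class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        # Follow links that look like encyclopedia entries, e.g. /item/Python/407313
        new_urls = set()
        for link in soup.find_all('a', href=re.compile(r'/item/')):
            new_urls.add(urlparse.urljoin(page_url, link['href']))
        # Page data: keep the field names used by the outputer sketch above
        title = soup.title.get_text() if soup.title else ''
        new_data = {'url': page_url, 'title': title, 'summary': ''}
        return new_urls, new_data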

-- D:\project_py\py_001\baike_spider\url_manager.py

'''
Created on 2017-12-04

@author: Administrator
'''
class UrlManager(object):
    def add_new_url(self, url):     # stubs; signatures match the calls in SpiderMain.craw
        pass

    def has_new_url(self):
        pass

    def get_new_url(self):
        pass

    def add_new_urls(self, urls):
        pass
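
One straightforward way to back these methods is two sets, separating URLs waiting to be crawled from URLs already crawled (a sketch under that assumption):

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url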