python_慕课\Python开发简单爬虫\5-3 Python爬虫urlib2实例代码.py
2017-12-04 11:45
961 查看
-- 3-1 Python简单爬虫架构 https://www.imooc.com/video/10677
-- 3-2 Python简单爬虫架构的动态 https://www.imooc.com/video/10678
-- 4-1 Python爬虫URL管理 https://www.imooc.com/video/10679
-- 4-2 Python爬虫URL管理器的实 https://www.imooc.com/video/10680
-- 5-2 Python爬虫urlib2下载器网. https://www.imooc.com/video/10682
-- 1
#!encoding:utf-8
# urllib2 download method 1: the simplest form.
import urllib2
# Issue the request directly.
response=urllib2.urlopen('http://www.baidu.com')
# Status code: 200 means the fetch succeeded.
print response.getcode()
# Read the page body.
cont=response.read()
print cont
-- 2
# !encoding:utf-8
# urllib2下载网页方法2:添加data、http header
import urllib2
# 创建requesst对象
request = urllib2.Request( "http://www.baidu.com" )
# 添加数据
request.add_data( 'a', '1' )
# 添加http的header
request.add_header( 'User-Agent', 'Mozila/5.0' )
#发送请求获取结果
respnse=urllib2.urlopen(request)
print respnse
-- 3
# !encoding:utf-8
# urllib2 download method 3: install a handler for special scenarios (cookies).
# (The original comment said "method 2" — this is the third method.)
import urllib2,cookielib
# Create a cookie container.
cj=cookielib.CookieJar()
# Build an opener that processes cookies.
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# Install the opener globally so urllib2 uses it for all requests.
urllib2.install_opener(opener)
# Subsequent urllib2 requests now carry the cookies in cj.
response=urllib2.urlopen('http://www.baidu.com')
print response
-- 5-3 Python爬虫urlib2实例代码 https://www.imooc.com/video/10683
# !encoding:utf-8
# Demo of all three urllib2 download methods against one URL.
import urllib2,cookielib
url='http://www.baidu.com'
# Method 1: plain urlopen.
response1 =urllib2.urlopen(url)
print response1.getcode()
print len(response1.read())
# Method 2: Request object carrying a custom User-Agent header.
request=urllib2.Request(url)
request.add_header('user-agent','Mozila/5.0')
response2 =urllib2.urlopen(request)
print response2.getcode()
print len( response2.read())
# Method 3: cookie-aware opener installed globally; cj collects the cookies.
cj=cookielib.CookieJar()
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
response3 =urllib2.urlopen(url)
print response3.getcode()
print len(response3.read())
print cj
-- 6-1 Python爬虫网页解析器简介 https://www.imooc.com/video/10684
-- 6-2 BeautifulSoup模块介绍和. https://www.imooc.com/video/10685
beautiful soup : python第三方库 用于在html 或 xml 中提取数据 官网 : http://www.crummy.com/software/BeautifulSoup/
安装 : pip install beautifulsoup4
import bs4 不报错就表示安装成功
-- 6-3 BeautifulSoup的语法 https://www.imooc.com/video/10686
搜索节点 find_all 、 find 访问节点名称、属性、文字
node.name :获取查找到的节点的标签名称
node['href'] :获取查找到的a节点的href属性
node.get_text() :获取查找到的a节点的链接文字
-- 6-4 BeautifulSoup实例测试. https://www.imooc.com/video/10687
# !encoding:utf-8
import urllib2, cookielib,re
from bs4 import BeautifulSoup
# copy 自 http://www.imooc.com/search/course?words=beatiful
html_doc = """
<html>
<body >
<div id="header">
<div class="page-container" id="nav" >
<div id="logo" class="logo"><a href="/" target="_self" class="hide-text" title="首页">慕课网</a></div>
<button type="button" class="navbar-toggle visible-xs-block js-show-menu" >
<i class="icon-menu"></i>
</button>
<div id="login-area">
<ul class="clearfix logined">
<li class="app-load" id="js-app-load">
<a href="//www.imooc.com/mobile/app" target="_blank" style="width:60px;">下载APP</a>
<div class="app-load-box clearfix js-load-box">
<img src="/static/img/common/appload.png" class="l">
<div class="r">
<p>扫描下载慕课网APP</p>
<a href="https://itunes.apple.com/cn/app/mu-ke-wang/id722179140?mt=8"><i class="icon-apple"></i>App Store下载</a>
<a href="//www.imooc.com/mobile/mukewang.apk"><i class="icon-android"></i>Android下载</a>
</div>
</div>
</li>
<li class="shop-cart" id="shop-cart">
<a href="http://order.imooc.com/pay/cart" class="shop-cart-icon" target="_blank">
<span class="icon-shopping-cart js-endcart"></span><span>购物车</span><span class="shopping_icon js-cart-num" data-ordernum="0" data-cartnum="0" style='display: none'>0</span>
</a>
<div class="my-cart" id="js-my-cart"></div>
</li>
<li class='remind_warp'>
<i class="msg_remind"></i>
<a target="_blank" href='/u/2460583/notices'>
<i class='icon-notifi'></i>
<!-- <span class="msg_icon" style="display: none;"></span> -->
</a>
</li>
<li class="set_btn user-card-box" id='header-user-card'>
<a id="header-avator" class="user-card-item js-header-avator" action-type="my_menu" href="/u/2460583" target="_self">
<img width="40" height="40">
<i class="myspace_remind" style="display: none;"></i>
<span style="display: none;">动态提醒</span>
</a>
<div class="g-user-card"></div>
</li>
</ul>
</div>
</div>
</body>
</html>
"""
#1
soup = BeautifulSoup( html_doc, 'html.parser', from_encoding='utf-8' )
print '获取所有链接'
links = soup.find_all( 'a' )
for link in links:
print link.name, link['href'], link.get_text()
# 2 获取 http://order.imooc.com/pay/cart 这个url
link_node = soup.find( 'a', href='http://order.imooc.com/pay/cart' )
print link_node.name, link_node['href'], link_node.get_text()
#正则表达式
link_node = soup.find( 'a', href=re.compile(r'cart'))
print link_node.name, link_node['href'], link_node.get_text()
-- 7-1 Python爬虫实例-分析目标 https://www.imooc.com/video/10688
-- 7-2 调度程序 https://www.imooc.com/video/10689
-- D:\project_py\py_001\baike_spider\spider_main.py
'''
Created on 2017年12月4日
@author: Administrator
'''
from django.test.signals import root_urlconf_changed
from baike_spider import url_manager, html_downloader,html_parser,html_outputer
if __name__=="__main__":
root_url="https://baike.baidu.com/item/Python/407313?fr=aladdin"
obj_spider=SpiderMain()
obj_spider.craw(root_url)
class SpiderMain(object):
def __init__(self): # @DontTrace
self.urls=url_manager.UrlManager()
self.downloader=html_downloader.HtmlDownloader()
self.parser=html_parser.HtmlParser()
self.outputer=html_outputer.HtmlOutputer()
def craw(self,root_url):
count=1
self.urls.add_new_url(root_url)
while self.urls.has_new_url():
try:
new_url=self.urls.get_new_url()
print 'craw %d:%s'%(count,new_url)
html_cont=self.downloader.download(new_url)
new_urls,new_data=self.parser.parse(new_url,html_cont)
self.urls.add_new_urls(new_urls)
self.outputer.collect_data(new_data)
if count==1000:
break
count=count+1
except:
print 'craw failed'
self.outputer.output_html()
-- D:\project_py\py_001\baike_spider\html_downloader.py
class HtmlDownloader(object):
    """Downloads the raw HTML content for a URL."""

    def download(self, url):
        """Return the page content for *url*, or None for a None URL.

        The caller (SpiderMain.craw) passes the URL, so the method must
        accept it; the original zero-argument signature raised TypeError.
        Still a stub: returns None until a real fetch is implemented.
        """
        if url is None:
            return None
        # TODO: fetch the page, e.g. urllib2.urlopen(url).read()
        return None
-- D:\project_py\py_001\baike_spider\html_outputer.py
class HtmlOutputer(object):
    """Accumulates parsed page records and writes them out as HTML."""

    def __init__(self):
        # Collected page records, in crawl order.
        self.datas = []

    def collect_data(self, data):
        """Store one parsed page record; None records are ignored.

        The caller passes the record, so the method must accept it; the
        original zero-argument signature raised TypeError.
        """
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write self.datas to an HTML report file. Stub."""
        pass
-- D:\project_py\py_001\baike_spider\html_parser.py
class HtmlParser(object):
    """Extracts follow-up URLs and page data from downloaded HTML."""

    def parse(self, page_url, html_cont):
        """Return a (new_urls, new_data) pair for the page.

        The caller unpacks two values and passes (url, content); the
        original zero-argument stub raised TypeError and returned None,
        which broke the unpacking. Returns empty results until a real
        parser (e.g. BeautifulSoup-based) is implemented.
        """
        if page_url is None or html_cont is None:
            return set(), None
        # TODO: parse html_cont for links and the page's data record.
        return set(), None
-- D:\project_py\py_001\baike_spider\url_manager.py
'''
Created on 2017-12-04

@author: Administrator
'''


class UrlManager(object):
    """Tracks which URLs are still to crawl and which have been handed out.

    The original methods were zero-argument stubs while SpiderMain calls
    them with URLs; signatures and a set-based implementation are filled in.
    """

    def __init__(self):
        self.new_urls = set()   # frontier: queued, not yet crawled
        self.old_urls = set()   # already handed out via get_new_url

    def add_new_url(self, url):
        """Queue one URL unless it is None or already seen."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        """Return True while the frontier is non-empty."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop one URL from the frontier and mark it as seen."""
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_urls(self, urls):
        """Queue a batch of URLs; None or empty batches are ignored."""
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)
-- 3-2 Python简单爬虫架构的动态 https://www.imooc.com/video/10678
-- 4-1 Python爬虫URL管理 https://www.imooc.com/video/10679
-- 4-2 Python爬虫URL管理器的实 https://www.imooc.com/video/10680
-- 5-2 Python爬虫urlib2下载器网. https://www.imooc.com/video/10682
-- 1
#!encoding:utf-8
# urllib2 download method 1: the simplest form.
import urllib2
# Issue the request directly.
response=urllib2.urlopen('http://www.baidu.com')
# Status code: 200 means the fetch succeeded.
print response.getcode()
# Read the page body.
cont=response.read()
print cont
-- 2
# !encoding:utf-8
# urllib2下载网页方法2:添加data、http header
import urllib2
# 创建requesst对象
request = urllib2.Request( "http://www.baidu.com" )
# 添加数据
request.add_data( 'a', '1' )
# 添加http的header
request.add_header( 'User-Agent', 'Mozila/5.0' )
#发送请求获取结果
respnse=urllib2.urlopen(request)
print respnse
-- 3
# !encoding:utf-8
# urllib2 download method 3: install a handler for special scenarios (cookies).
# (The original comment said "method 2" — this is the third method.)
import urllib2,cookielib
# Create a cookie container.
cj=cookielib.CookieJar()
# Build an opener that processes cookies.
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# Install the opener globally so urllib2 uses it for all requests.
urllib2.install_opener(opener)
# Subsequent urllib2 requests now carry the cookies in cj.
response=urllib2.urlopen('http://www.baidu.com')
print response
-- 5-3 Python爬虫urlib2实例代码 https://www.imooc.com/video/10683
# !encoding:utf-8
# Demo of all three urllib2 download methods against one URL.
import urllib2,cookielib
url='http://www.baidu.com'
# Method 1: plain urlopen.
response1 =urllib2.urlopen(url)
print response1.getcode()
print len(response1.read())
# Method 2: Request object carrying a custom User-Agent header.
request=urllib2.Request(url)
request.add_header('user-agent','Mozila/5.0')
response2 =urllib2.urlopen(request)
print response2.getcode()
print len( response2.read())
# Method 3: cookie-aware opener installed globally; cj collects the cookies.
cj=cookielib.CookieJar()
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
response3 =urllib2.urlopen(url)
print response3.getcode()
print len(response3.read())
print cj
-- 6-1 Python爬虫网页解析器简介 https://www.imooc.com/video/10684
-- 6-2 BeautifulSoup模块介绍和. https://www.imooc.com/video/10685
beautiful soup : python第三方库 用于在html 或 xml 中提取数据 官网 : http://www.crummy.com/software/BeautifulSoup/
安装 : pip install beautifulsoup4
import bs4 不报错就表示安装成功
-- 6-3 BeautifulSoup的语法 https://www.imooc.com/video/10686
搜索节点 find_all 、 find 访问节点名称、属性、文字
node.name :获取查找到的节点的标签名称
node['href'] :获取查找到的a节点的href属性
node.get_text() :获取查找到的a节点的链接文字
-- 6-4 BeautifulSoup实例测试. https://www.imooc.com/video/10687
# !encoding:utf-8
import urllib2, cookielib,re
from bs4 import BeautifulSoup
# copy 自 http://www.imooc.com/search/course?words=beatiful
html_doc = """
<html>
<body >
<div id="header">
<div class="page-container" id="nav" >
<div id="logo" class="logo"><a href="/" target="_self" class="hide-text" title="首页">慕课网</a></div>
<button type="button" class="navbar-toggle visible-xs-block js-show-menu" >
<i class="icon-menu"></i>
</button>
<div id="login-area">
<ul class="clearfix logined">
<li class="app-load" id="js-app-load">
<a href="//www.imooc.com/mobile/app" target="_blank" style="width:60px;">下载APP</a>
<div class="app-load-box clearfix js-load-box">
<img src="/static/img/common/appload.png" class="l">
<div class="r">
<p>扫描下载慕课网APP</p>
<a href="https://itunes.apple.com/cn/app/mu-ke-wang/id722179140?mt=8"><i class="icon-apple"></i>App Store下载</a>
<a href="//www.imooc.com/mobile/mukewang.apk"><i class="icon-android"></i>Android下载</a>
</div>
</div>
</li>
<li class="shop-cart" id="shop-cart">
<a href="http://order.imooc.com/pay/cart" class="shop-cart-icon" target="_blank">
<span class="icon-shopping-cart js-endcart"></span><span>购物车</span><span class="shopping_icon js-cart-num" data-ordernum="0" data-cartnum="0" style='display: none'>0</span>
</a>
<div class="my-cart" id="js-my-cart"></div>
</li>
<li class='remind_warp'>
<i class="msg_remind"></i>
<a target="_blank" href='/u/2460583/notices'>
<i class='icon-notifi'></i>
<!-- <span class="msg_icon" style="display: none;"></span> -->
</a>
</li>
<li class="set_btn user-card-box" id='header-user-card'>
<a id="header-avator" class="user-card-item js-header-avator" action-type="my_menu" href="/u/2460583" target="_self">
<img width="40" height="40">
<i class="myspace_remind" style="display: none;"></i>
<span style="display: none;">动态提醒</span>
</a>
<div class="g-user-card"></div>
</li>
</ul>
</div>
</div>
</body>
</html>
"""
#1
soup = BeautifulSoup( html_doc, 'html.parser', from_encoding='utf-8' )
print '获取所有链接'
links = soup.find_all( 'a' )
for link in links:
print link.name, link['href'], link.get_text()
# 2 获取 http://order.imooc.com/pay/cart 这个url
link_node = soup.find( 'a', href='http://order.imooc.com/pay/cart' )
print link_node.name, link_node['href'], link_node.get_text()
#正则表达式
link_node = soup.find( 'a', href=re.compile(r'cart'))
print link_node.name, link_node['href'], link_node.get_text()
-- 7-1 Python爬虫实例-分析目标 https://www.imooc.com/video/10688
-- 7-2 调度程序 https://www.imooc.com/video/10689
-- D:\project_py\py_001\baike_spider\spider_main.py
'''
Created on 2017年12月4日
@author: Administrator
'''
from django.test.signals import root_urlconf_changed
from baike_spider import url_manager, html_downloader,html_parser,html_outputer
if __name__=="__main__":
root_url="https://baike.baidu.com/item/Python/407313?fr=aladdin"
obj_spider=SpiderMain()
obj_spider.craw(root_url)
class SpiderMain(object):
def __init__(self): # @DontTrace
self.urls=url_manager.UrlManager()
self.downloader=html_downloader.HtmlDownloader()
self.parser=html_parser.HtmlParser()
self.outputer=html_outputer.HtmlOutputer()
def craw(self,root_url):
count=1
self.urls.add_new_url(root_url)
while self.urls.has_new_url():
try:
new_url=self.urls.get_new_url()
print 'craw %d:%s'%(count,new_url)
html_cont=self.downloader.download(new_url)
new_urls,new_data=self.parser.parse(new_url,html_cont)
self.urls.add_new_urls(new_urls)
self.outputer.collect_data(new_data)
if count==1000:
break
count=count+1
except:
print 'craw failed'
self.outputer.output_html()
-- D:\project_py\py_001\baike_spider\html_downloader.py
class HtmlDownloader(object):
    """Downloads the raw HTML content for a URL."""

    def download(self, url):
        """Return the page content for *url*, or None for a None URL.

        The caller (SpiderMain.craw) passes the URL, so the method must
        accept it; the original zero-argument signature raised TypeError.
        Still a stub: returns None until a real fetch is implemented.
        """
        if url is None:
            return None
        # TODO: fetch the page, e.g. urllib2.urlopen(url).read()
        return None
-- D:\project_py\py_001\baike_spider\html_outputer.py
class HtmlOutputer(object):
    """Accumulates parsed page records and writes them out as HTML."""

    def __init__(self):
        # Collected page records, in crawl order.
        self.datas = []

    def collect_data(self, data):
        """Store one parsed page record; None records are ignored.

        The caller passes the record, so the method must accept it; the
        original zero-argument signature raised TypeError.
        """
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write self.datas to an HTML report file. Stub."""
        pass
-- D:\project_py\py_001\baike_spider\html_parser.py
class HtmlParser(object):
    """Extracts follow-up URLs and page data from downloaded HTML."""

    def parse(self, page_url, html_cont):
        """Return a (new_urls, new_data) pair for the page.

        The caller unpacks two values and passes (url, content); the
        original zero-argument stub raised TypeError and returned None,
        which broke the unpacking. Returns empty results until a real
        parser (e.g. BeautifulSoup-based) is implemented.
        """
        if page_url is None or html_cont is None:
            return set(), None
        # TODO: parse html_cont for links and the page's data record.
        return set(), None
-- D:\project_py\py_001\baike_spider\url_manager.py
'''
Created on 2017-12-04

@author: Administrator
'''


class UrlManager(object):
    """Tracks which URLs are still to crawl and which have been handed out.

    The original methods were zero-argument stubs while SpiderMain calls
    them with URLs; signatures and a set-based implementation are filled in.
    """

    def __init__(self):
        self.new_urls = set()   # frontier: queued, not yet crawled
        self.old_urls = set()   # already handed out via get_new_url

    def add_new_url(self, url):
        """Queue one URL unless it is None or already seen."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        """Return True while the frontier is non-empty."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop one URL from the frontier and mark it as seen."""
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_urls(self, urls):
        """Queue a batch of URLs; None or empty batches are ignored."""
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)
相关文章推荐
- python_慕课\Python开发简单爬虫\7-7 开始运行爬虫和爬取结果展.py
- python利用urlib2进行简单爬虫实例
- python开发爬虫实例代码
- Python实现爬取知乎神回复简单爬虫代码分享
- python妹子图简单爬虫实例
- Python开发简单爬虫(笔记)
- 简单的python爬虫抓取图片实例
- python3简单爬虫实现代码
- python实现简单爬虫功能代码
- Python开发简单爬虫 - 慕课网
- Python开发简单爬虫(一)
- python开发简单爬虫:准备篇
- [C#]一步一步开发自己的自动代码生成工具之四:简单三层代码模板实例Model层
- Python 开发简单爬虫 学习笔记1
- Python开发简单爬虫
- python-利用pyaudio进行声音录制及简单实例代码分享
- 一则python3的简单爬虫代码
- Python开发简单爬虫学习笔记(1)
- Python实现爬取知乎神回复简单爬虫代码分享
- Python开发简单爬虫学习笔记(2)